# Transformação de dados 

In [25]:
# Bibliotecas 
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt 
import seaborn as sns
%matplotlib inline 

# Parametros próprio
from sklearn.model_selection import train_test_split # Utilizado para treinamento e teste
from sklearn import preprocessing                     # Pré processamento de dados
from sklearn.linear_model import LinearRegression     # Vizualização regressão linear. 

np.set_printoptions(threshold=None, precision=2)      # Decimal places shown for NumPy arrays
pd.set_option('display.max_columns', 500)             # Max number of columns displayed (pandas)
pd.set_option('display.max_rows', 500)                # Max number of rows displayed (pandas)
# Use the fully qualified option name: the bare 'precision' pattern also matches
# 'styler.format.precision' on pandas >= 1.4 and raises OptionError there.
pd.set_option('display.precision', 2)                 # Decimal places displayed (pandas)

In [18]:
# Load the Titanic passenger table from the local CSV file and display it.
# NOTE(review): this is an absolute, machine-specific path; the notebook will not
# run elsewhere until it is pointed at a shared/relative data directory.
TITANIC_CSV = 'E:/PUC-MG/Machine learning/titanic.csv'
titanic = pd.read_csv(TITANIC_CSV, sep=',')
titanic

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.28,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.92,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.10,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S
...,...,...,...,...,...,...,...,...,...,...,...,...
886,887,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.00,,S
887,888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.00,B42,S
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.45,,S
889,890,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.00,C148,C


In [19]:
# Remove the PassengerId and Ticket columns.
# A single DataFrame.drop with errors='ignore' keeps this cell re-runnable: the
# previous `del` statements raised KeyError when the cell was executed a second
# time, because the columns were already gone.
titanic = titanic.drop(columns=['PassengerId', 'Ticket'], errors='ignore')
titanic

Unnamed: 0,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Fare,Cabin,Embarked
0,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,7.25,,S
1,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,71.28,C85,C
2,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,7.92,,S
3,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,53.10,C123,S
4,0,3,"Allen, Mr. William Henry",male,35.0,0,0,8.05,,S
...,...,...,...,...,...,...,...,...,...,...
886,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,13.00,,S
887,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,30.00,B42,S
888,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,23.45,,S
889,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,30.00,C148,C


In [30]:
# Convert NUMERIC fields to CATEGORICAL ones, e.g. 0 and 1 -> Morreu and Sobreviveu.
# The renames use explicit {old: new} mappings instead of positional lists, so a
# label cannot be attached to the wrong code if a value is absent from the data,
# and re-running the cell is harmless (categories that are not in the mapping are
# passed through unchanged).
new_survived = pd.Categorical(titanic['Survived'])                              # numeric -> categorical
new_survived = new_survived.rename_categories({0: 'Morreu', 1: 'Sobreviveu'})   # 0 = Morreu, 1 = Sobreviveu
titanic['Survived'] = new_survived                                              # replace the column

# Passenger class is created as an ordered categorical (ordered=True).
new_Pclass = pd.Categorical(titanic['Pclass'], ordered=True)                    # numeric -> ordered categorical
new_Pclass = new_Pclass.rename_categories(
    {1: '1aClasse', 2: '2aClasse', 3: '3aClasse'}                               # 1, 2, 3 -> 1st, 2nd, 3rd class
)
titanic['Pclass'] = new_Pclass
titanic

Unnamed: 0,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Fare,Cabin,Embarked
0,Morreu,3aClasse,"Braund, Mr. Owen Harris",male,22.0,1,0,7.25,,S
1,Sobreviveu,1aClasse,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,71.28,C85,C
2,Sobreviveu,3aClasse,"Heikkinen, Miss. Laina",female,26.0,0,0,7.92,,S
3,Sobreviveu,1aClasse,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,53.10,C123,S
4,Morreu,3aClasse,"Allen, Mr. William Henry",male,35.0,0,0,8.05,,S
...,...,...,...,...,...,...,...,...,...,...
886,Morreu,2aClasse,"Montvila, Rev. Juozas",male,27.0,0,0,13.00,,S
887,Sobreviveu,1aClasse,"Graham, Miss. Margaret Edith",female,19.0,0,0,30.00,B42,S
888,Morreu,3aClasse,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,23.45,,S
889,Sobreviveu,1aClasse,"Behr, Mr. Karl Howell",male,26.0,0,0,30.00,C148,C


In [31]:
# Reduce every cabin code to its first character only (e.g. from "A10" to "A").

# Cast the Cabin column to str so that every entry can be indexed; missing cabins
# turn into the text "nan" and therefore end up in the category "n".
char_cabin = titanic['Cabin'].astype(str)

# Keep just the leading character of each entry and store the result as a
# categorical column named 'cabin' (the original 'Cabin' column is left as is).
new_cabin = pd.Categorical(char_cabin.map(lambda text: text[0]))
titanic['cabin'] = new_cabin
titanic['cabin']

0      n
1      C
2      n
3      C
4      n
      ..
886    n
887    B
888    n
889    C
890    n
Name: cabin, Length: 891, dtype: category
Categories (9, object): [A, B, C, D, ..., F, G, T, n]