In [1]:
import pandas as pd
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score
from sklearn.tree import DecisionTreeClassifier

In [2]:
# Load the training CSV; the relative path is resolved against the current working directory.
trainData = pd.read_csv(filepath_or_buffer='../data/train.csv')

In [3]:
# Preview the first five rows of the raw data.
trainData.head(n=5)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


### Working the Data

In [4]:
# Label-encode the categorical columns into integer codes.
# Each column gets its own encoder: fitting one shared encoder twice keeps only
# the last column's classes, so the Sex codes could no longer be decoded.
sex_encoder = preprocessing.LabelEncoder()
embarked_encoder = preprocessing.LabelEncoder()
# `label_encoder` previously ended up fitted on Embarked; keep that name bound
# to the same fitted encoder for any later cell that still refers to it.
label_encoder = embarked_encoder

trainData['SexCode'] = sex_encoder.fit_transform(trainData['Sex'])
# NOTE(review): Embarked is encoded as-is — confirm it has no missing values,
# or decide how they should be filled before encoding.
trainData['EmbarkedCode'] = embarked_encoder.fit_transform(trainData['Embarked'])

In [5]:
# Check that the two encoded columns were added.
trainData.head(n=3)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,SexCode,EmbarkedCode
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S,1,2
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C,0,0
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S,0,2


In [None]:
# Fill missing Age values with the column mean, then store ages as integers.
# The result is assigned back instead of calling fillna(inplace=True) on the
# column selection: the in-place form acts on an intermediate object, which
# pandas warns about and which does not update trainData under Copy-on-Write.
mean_age = trainData['Age'].mean()
# astype(int) truncates toward zero, matching the previous int(age) conversion.
trainData['Age'] = trainData['Age'].fillna(mean_age).astype(int)
trainData['Age']

In [7]:
# Keep only the target and the model features, in a fixed column order.
model_columns = ['Survived', 'Age', 'Pclass', 'SibSp', 'Parch', 'SexCode', 'EmbarkedCode']
# .copy() yields an independent frame, so later edits do not touch trainData.
filteredTrainData = trainData[model_columns].copy()

In [8]:
# Preview the reduced frame (target plus features).
filteredTrainData.head(n=5)

Unnamed: 0,Survived,Age,Pclass,SibSp,Parch,SexCode,EmbarkedCode
0,0,22,3,1,0,1,2
1,1,38,1,1,0,0,0
2,1,26,3,0,0,0,2
3,1,35,1,1,0,0,2
4,0,35,3,0,0,1,2


### Put data values on the same scale

In [12]:
from sklearn.preprocessing import StandardScaler

In [9]:
# Collect the feature columns (everything except the Survived target) that are
# about to be standardised. The values in this frame are still unscaled.
feature_columns = ['Age', 'Pclass', 'SibSp', 'Parch', 'SexCode', 'EmbarkedCode']
scaledTrainData = filteredTrainData[feature_columns].copy()

In [10]:
# Preview the feature-only frame before scaling.
scaledTrainData.head(n=5)

Unnamed: 0,Age,Pclass,SibSp,Parch,SexCode,EmbarkedCode
0,22,3,1,0,1,2
1,38,1,1,0,0,0
2,26,3,0,0,0,2
3,35,1,1,0,0,2
4,35,3,0,0,1,2


In [13]:
# Adjust the scale of the values in the columns.
# StandardScaler rescales each column to zero mean and unit variance.
# NOTE(review): the scaler is fitted on every row before the train/test split
# further down, so test rows influence the scaling statistics; consider fitting
# on the training rows only.
standardized_values = StandardScaler().fit_transform(scaledTrainData)
# fit_transform returns a bare array; restore the original index as well as the
# column names so later label-aligned assignments match rows correctly.
df_scaled = pd.DataFrame(
    standardized_values,
    columns=scaledTrainData.columns,
    index=scaledTrainData.index,
)

In [16]:
# Attach the (unscaled) target column; rows are matched on the index.
df_scaled = df_scaled.assign(Survived=filteredTrainData['Survived'])

In [17]:
# Preview the standardised features together with the target.
df_scaled.head(n=5)

Unnamed: 0,Age,Pclass,SibSp,Parch,SexCode,EmbarkedCode,Survived
0,-0.580044,0.827377,0.432793,-0.473674,0.737695,0.581114,0
1,0.650112,-1.566107,0.432793,-0.473674,-1.355574,-1.93846,1
2,-0.272505,0.827377,-0.474545,-0.473674,-1.355574,0.581114,1
3,0.419458,-1.566107,0.432793,-0.473674,-1.355574,0.581114,1
4,0.419458,0.827377,-0.474545,-0.473674,0.737695,0.581114,0


### Split data into train and test sets

In [23]:
# Split the frame into the feature matrix and the target by column name rather
# than by hard-coded position, so a change in column order or count cannot
# silently put the target among the features or select the wrong column.
target_column = 'Survived'
x = df_scaled.drop(columns=target_column).to_numpy()
y = df_scaled[target_column]

In [26]:
# Display the feature matrix (a NumPy array) built from the scaled columns.
x

array([[-0.58004441,  0.82737724,  0.43279337, -0.47367361,  0.73769513,
         0.58111394],
       [ 0.65011226, -1.56610693,  0.43279337, -0.47367361, -1.35557354,
        -1.93846038],
       [-0.27250525,  0.82737724, -0.4745452 , -0.47367361, -1.35557354,
         0.58111394],
       ...,
       [-0.04185087,  0.82737724,  0.43279337,  2.00893337, -1.35557354,
         0.58111394],
       [-0.27250525, -1.56610693, -0.4745452 , -0.47367361,  0.73769513,
        -1.93846038],
       [ 0.18880351,  0.82737724, -0.4745452 , -0.47367361,  0.73769513,
        -0.67867322]])

In [27]:
# Hold out 25% of the rows for testing; the fixed seed keeps the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.25,
    random_state=0,
)

In [29]:
# Number of rows in the training split.
y_train.shape[0]

668

## Model Training

### Decision Tree Model

In [30]:
# Fit a decision tree on the training split.
# random_state is fixed because the tree permutes features at every split, so
# without a seed the fitted tree (and its accuracy) can change between runs.
decision_tree_model = DecisionTreeClassifier(random_state=0)
decision_tree_model.fit(x_train, y_train)

In [31]:
# Evaluate the decision tree on the held-out rows.
predictions = decision_tree_model.predict(x_test)

# Share of test rows whose predicted label matches the true label.
modelAccuracy = accuracy_score(y_test, predictions)

# Confusion matrix of true vs. predicted labels (stored, not displayed here).
confusao = confusion_matrix(y_test, predictions)

modelAccuracy

0.7713004484304933

### SVM Model

In [32]:
from sklearn.svm import SVC

In [33]:
# Train a support vector classifier with a linear kernel (fit returns the estimator).
svm = SVC(kernel='linear').fit(x_train, y_train)

# Score the classifier on the held-out rows.
smv_predictions = svm.predict(x_test)
model_accuracy = accuracy_score(y_true=y_test, y_pred=smv_predictions)
model_accuracy

0.7802690582959642

In [35]:
# pip install xgboost
import xgboost as xgb

In [36]:
# Survived is a 0/1 class label, so use the classifier with a logistic
# objective. The previous XGBRegressor produced continuous predictions, which
# accuracy_score rejects, and its 'reg:linear' objective name is deprecated.
# The variable name is kept so the cells that follow still work.
xg_reg = xgb.XGBClassifier(
    objective='binary:logistic',
    colsample_bytree=0.3,
    learning_rate=0.1,
    max_depth=5,
    reg_alpha=10,  # L1 regularisation (was passed as `alpha`)
    n_estimators=10,
)

In [37]:
# Fit the XGBoost model on the training split.
xg_reg.fit(X=x_train, y=y_train)



In [39]:
# Predict on the held-out rows with the fitted XGBoost model.
xgb_prediction = xg_reg.predict(X=x_test)

In [None]:
# Inspect the raw XGBoost predictions.
xgb_prediction

In [None]:
# xgb accuracy score
# accuracy_score compares discrete class labels, so the predictions are
# thresholded at 0.5 first: continuous scores become 0/1 classes, while
# predictions that are already 0/1 labels pass through unchanged.
xgb_labels = (xgb_prediction >= 0.5).astype(int)
model_accuracy = accuracy_score(y_test, xgb_labels)
model_accuracy