In [14]:
import pandas as pd
import numpy as np
import plotly.graph_objs as go
import plotly.express as px
from plotly.subplots import make_subplots
from sklearn.metrics import accuracy_score, log_loss
from sklearn import preprocessing
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_validate
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_val_predict
from sklearn.metrics import confusion_matrix
from sklearn.svm import SVC, LinearSVC
from sklearn.preprocessing import LabelEncoder
from sklearn.calibration import CalibratedClassifierCV
%matplotlib inline

# Read the two CSV files and stack them (train rows first) into one frame
# with a fresh 0..n-1 index.
trd = pd.read_csv('train.csv')
tsd = pd.read_csv('test.csv')
df = pd.concat(
    [trd, tsd],
    sort=False,
    ignore_index=True,
)

In [2]:
# Preview the first five rows of the combined frame.
df.head(n=5)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0.0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1.0,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1.0,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1.0,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0.0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [None]:
# Print the column names, non-null counts and dtypes of the combined frame.
df.info()

In [15]:
# Passenger count per distinct age (missing ages are excluded by value_counts).
# The labels in `x` are taken from the index of `y`, so x[i] always belongs to
# y[i]; previously `unique()` (order of appearance) and `value_counts()`
# (order of frequency) were paired up even though their orders differ.
# Sorting by age gives the plot a monotonically increasing x axis.
y = df['Age'].value_counts().sort_index()
x = y.index.to_numpy()

In [17]:
# Passenger count per sex. The labels in `x1` are read from the index of `y1`
# so that each label is guaranteed to sit next to its own count; `unique()`
# returns labels in order of appearance, which only matches the
# frequency-ordered counts by coincidence.
y1 = df['Sex'].value_counts()
x1 = y1.index.to_numpy()

In [16]:
# Draw the age counts and the sex counts side by side. Each distribution gets
# its own subplot because ages are numeric and sexes are categorical: on one
# shared x axis the two traces cannot both be placed meaningfully.
# Bars are used instead of connected scatter points because the values are
# counts per category and the result does not depend on the order of x.
fig = make_subplots(
    rows=1,
    cols=2,
    subplot_titles=('Passengers by age', 'Passengers by sex'),
)
fig.add_trace(go.Bar(x=x, y=y, name='Age'), row=1, col=1)
fig.add_trace(go.Bar(x=x1, y=y1, name='Sex'), row=1, col=2)
fig.update_layout(showlegend=False)
fig.show()

In [None]:
# Number of missing values in each column (isnull is an alias of isna).
df.isnull().sum()

In [None]:
# Remove the columns that are not used further on.
df = df.drop(columns=['Name', 'Ticket', 'Cabin', 'Embarked'])

In [None]:
# Keep only the rows that have a value in every column.
# NOTE(review): rows that came from test.csv presumably have no Survived value
# and would therefore be removed here as well - confirm this is intended.
df = df.dropna(axis=0, how='any')

In [None]:
# Distinct values currently present in the Age column.
df.Age.unique()

In [None]:
# Replace each age by its ten-year bin; the edges run 0, 10, ..., 80.
df['Age'] = pd.cut(df['Age'], bins=list(range(0, 81, 10)))

In [None]:
# Share of each sex among the training passengers, in percent.
fig_sex = (trd.Sex.value_counts(normalize=True) * 100).plot.bar()

# Survival rate within each sex: the mean of the 0/1 Survived column of that
# group, expressed as a percentage. Dividing by the size of the group (not by
# the size of the whole frame) is what makes "survived" and "not survived"
# add up to 100 for each sex. Taking the mean also avoids looking up the
# survivor count by integer position in a boolean-indexed value_counts()
# result, which picks the wrong class whenever survivors are the majority.
sex_survival_rate = trd.groupby('Sex')['Survived'].mean() * 100
male_pr = round(sex_survival_rate['male'], 2)
female_pr = round(sex_survival_rate['female'], 2)
sex_perc_df = pd.DataFrame(
    {
        "Percentage Survived": {"male": male_pr, "female": female_pr},
        "Percentage Not Survived": {
            "male": round(100 - male_pr, 2),
            "female": round(100 - female_pr, 2),
        },
    }
)
sex_perc_df.plot.barh().set_title("Percentage of male and female survived and Deceased")
fig_sex

In [None]:
# Value counts of one column, separately for survivors and non-survivors.
# The column is selected by name: the previous selector `x` held an array of
# age values, which are not column labels and raise a KeyError.
# NOTE(review): 'Sex' is assumed to be the intended column because the
# neighbouring cells analyse survival by sex - confirm.
survived = df.loc[df['Survived'] == 1, 'Sex'].value_counts()
dead = df.loc[df['Survived'] == 0, 'Sex'].value_counts()

In [None]:
def _survival_percentages(feature, df):
    """Return the survived / not-survived percentages for each value of `feature`."""
    # The mean of the 0/1 Survived column inside a group is that group's
    # survival rate. observed=True keeps empty categorical bins out of the result.
    rates = df.groupby(feature, observed=True)['Survived'].mean().mul(100).round(2)
    return pd.DataFrame({"Survived, %": rates, "Not Survived, %": (100 - rates).round(2)})


def fig_chart(feature, df):
    """Plot, for every value of a column, the share of passengers who survived.

    Parameters
    ----------
    feature : str
        Name of the column in `df` whose values define the groups.
    df : pandas.DataFrame
        Frame containing the `feature` column and a 0/1 `Survived` column.

    Returns
    -------
    None
        A horizontal bar chart is drawn as a side effect.

    Raises
    ------
    KeyError
        If `feature` or `Survived` is not a column of `df`.

    Notes
    -----
    Percentages are computed within each group, so the two bars of a group add
    up to 100. Rows with a missing `feature` value are left out, and missing
    `Survived` values are ignored when averaging.
    """
    summary = _survival_percentages(feature, df)
    summary.plot.barh().set_title("Percentage of survived and Deceased")

In [None]:
# The column is passed by name; a bare `Sex` is an undefined variable.
fig_chart('Sex', df)

In [None]:
# Survival percentages per value of one column, followed by a bar chart.
# `feature` names the column to analyse; the frame has no column that is
# literally called 'feature'.
feature = 'Sex'
describe = df[feature].dropna().unique()
update_sur = {}
update_dead = {}
for category in describe:
    # Mean of the 0/1 Survived column inside the group = survival rate of that
    # group, so "survived" and "not survived" add up to 100 per category.
    x_pr = round(df.loc[df[feature] == category, 'Survived'].mean() * 100, 2)
    update_sur[category] = x_pr
    update_dead[category] = round(100 - x_pr, 2)

# The summary is stored under its own name: assigning it to `df` would replace
# the passenger data that the following cells still read.
survival_pct_df = pd.DataFrame({"Survived, %": update_sur, "Not Survived, %": update_dead})
survival_pct_df.plot.barh().set_title("Percentage of survived and Deceased")

In [None]:
# Replace the Sex labels by integer codes learned from this same column.
df['Sex'] = LabelEncoder().fit(df.Sex).transform(df.Sex)

In [None]:
# Build the frame to predict on.
# It must carry the same columns the training frame keeps (everything except
# Name, Ticket, Cabin, Embarked and the Survived target). Previously the
# numeric predictors were dropped and the text columns kept, so the frame
# could not match what the model is fitted on.
feature_columns = ['PassengerId', 'Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare']
X_to_be_predicted = tsd[feature_columns].copy()

# NOTE(review): this encoder is fitted on the test labels only; the codes agree
# with the training encoding as long as both files contain the same set of
# Sex labels - confirm.
X_to_be_predicted['Sex'] = LabelEncoder().fit_transform(X_to_be_predicted['Sex'])

# Every test passenger needs a prediction, so rows with gaps are filled with
# the training medians instead of being dropped.
numeric_columns = ['Pclass', 'Age', 'SibSp', 'Parch', 'Fare']
X_to_be_predicted[numeric_columns] = X_to_be_predicted[numeric_columns].fillna(
    trd[numeric_columns].median()
)

# Bin Age with the same edges used for the training frame; labels=False yields
# the integer index of the bin, which an estimator can consume directly.
# NOTE(review): the training Age column has to use the same integer encoding
# before fitting - confirm.
X_to_be_predicted['Age'] = pd.cut(
    X_to_be_predicted['Age'], [0, 10, 20, 30, 40, 50, 60, 70, 80], labels=False
)

In [None]:
# (rows, columns) of the frame that will be passed to the model for prediction.
X_to_be_predicted.shape

In [None]:
# Prepare the data used for fitting and evaluation.
train_data = df.dropna()  # work on a copy of the frame without incomplete rows
y = train_data['Survived']  # dependent variable
X = train_data.drop(columns=['Survived'])  # predictors

# After binning, Age holds Interval categories, which an estimator cannot
# convert to numbers. Replace them by the integer position of the bin
# (0 for the first bin, 1 for the second, ...). The guard keeps the cell valid
# when Age has not been binned.
if isinstance(X['Age'].dtype, pd.CategoricalDtype):
    X = X.assign(Age=X['Age'].cat.codes)

train_data.shape

In [None]:
# Split into training and hold-out sets; 20% of the rows go to the hold-out set.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Random forest classifier with shallow trees.
model = RandomForestClassifier(
    criterion='gini',
    n_estimators=300,
    min_samples_split=10,
    min_samples_leaf=1,
    max_depth=3,  # maximum number of levels per tree
    # 'auto' meant sqrt(n_features) for classifiers and is no longer accepted
    # by current scikit-learn releases; 'sqrt' selects the same behaviour.
    max_features='sqrt',
    oob_score=True,
    random_state=42,
    n_jobs=-1,
    warm_start=False,  # do not reuse the previous fit to grow the existing forest
    class_weight=None,
)

model.fit(X_train, y_train)

# Pass exactly the columns the model was fitted on, in the same order, so the
# prediction does not depend on how the prediction frame happens to be laid out.
result = model.predict(X_to_be_predicted[X_train.columns])

# Accuracy on the hold-out set.
print("RF Accuracy: "+repr(round(model.score(X_test, y_test) * 100, 2)) + "%")

# 10-fold cross-validated accuracy on the training set.
result_rf = cross_val_score(model, X_train, y_train, cv=10, scoring='accuracy')

print('The cross validated score for Random forest is:',round(result_rf.mean()*100,2))

# Out-of-fold predictions for the training rows.
y_pred = cross_val_predict(model, X_train, y_train, cv=10)

# Contribution of each predictor, most important first.
pd.DataFrame({'feature': X.columns,
              'importance': model.feature_importances_}).sort_values('importance', ascending=False)

In [None]:
# Pair every passenger id with its predicted label, store the label as an
# integer, and write the table to disk without the index column.
filename = 'Titanic_test_predictions.csv'
submission = pd.DataFrame(
    {
        'PassengerId': X_to_be_predicted['PassengerId'],
        'Survived': result,
    }
)
submission['Survived'] = submission['Survived'].astype(int)
print(submission.shape)
submission.to_csv(filename, index=False)
print(f'Saved file: {filename}')