# prepro

In [9]:
import pandas as pd
import seaborn as sns
from package_ml.get_data import Data
from package_ml.preprocessing import Preprocessing
from package_ml.model import Model
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
import numpy as np

from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import make_column_transformer
from sklearn.impute import SimpleImputer

import warnings
warnings.filterwarnings("ignore")

In [2]:
do = Data('titanic') 

In [3]:
titanic_data = do.get_data()

In [4]:
titanic_data

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,,S
...,...,...,...,...,...,...,...,...,...,...,...,...
886,887,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0000,,S
887,888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0000,B42,S
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.4500,,S
889,890,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0000,C148,C


In [5]:
titanic_data.isnull().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

In [6]:
titanic_data['Embarked'].unique()

array(['S', 'C', 'Q', nan], dtype=object)

In [16]:
titanic_data.dtypes

PassengerId      int64
Survived         int64
Pclass           int64
Name            object
Sex             object
Age            float64
SibSp            int64
Parch            int64
Ticket          object
Fare           float64
Cabin           object
Embarked        object
dtype: object

# pipeline

In [19]:
numerical_features = ['PassengerId', 'Survived', 'Pclass', 'Age', 'SibSp', 'Parch', 'Fare']
categorical_features = ["Sex", "Embarked"]
# categorical_features = titanic_data.select_dtypes(include = ['object']).columns

In [20]:
numerical_pipeline = make_pipeline(SimpleImputer(missing_values=np.nan, strategy='mean'), StandardScaler())
categorical_pipeline = make_pipeline(SimpleImputer(missing_values=np.nan, strategy='most_frequent'), OneHotEncoder())

In [21]:
preprocessor = make_column_transformer(
    (numerical_pipeline, numerical_features), (categorical_pipeline, categorical_features))

In [22]:
processed = preprocessor.fit_transform(titanic_data)

In [23]:
df = pd.DataFrame(processed, columns = titanic_data.columns)
df

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,-1.730108,-0.789272,0.827377,-0.592481,0.432793,-0.473674,-0.502445,0.0,1.0,0.0,0.0,1.0
1,-1.726220,1.266990,-1.566107,0.638789,0.432793,-0.473674,0.786845,1.0,0.0,1.0,0.0,0.0
2,-1.722332,1.266990,0.827377,-0.284663,-0.474545,-0.473674,-0.488854,1.0,0.0,0.0,0.0,1.0
3,-1.718444,1.266990,-1.566107,0.407926,0.432793,-0.473674,0.420730,1.0,0.0,0.0,0.0,1.0
4,-1.714556,-0.789272,0.827377,0.407926,-0.474545,-0.473674,-0.486337,0.0,1.0,0.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...
886,1.714556,-0.789272,-0.369365,-0.207709,-0.474545,-0.473674,-0.386671,0.0,1.0,0.0,0.0,1.0
887,1.718444,1.266990,-1.566107,-0.823344,-0.474545,-0.473674,-0.044381,1.0,0.0,0.0,0.0,1.0
888,1.722332,-0.789272,0.827377,0.000000,0.432793,2.008933,-0.176263,1.0,0.0,0.0,0.0,1.0
889,1.726220,1.266990,-1.566107,-0.284663,-0.474545,-0.473674,-0.044381,0.0,1.0,1.0,0.0,0.0


In [None]:
preproc = Preprocessing(titanic_data)

In [None]:
preproc.drop('Cabin')

In [None]:
preproc.mean_inputer('Age')

In [None]:
preproc.mode_inputer('Embarked')

In [None]:
preproc.encoding('Sex','Embarked')

In [None]:
titanic_data = preproc.final_df()

In [None]:
titanic_data['Pclass'].unique()

# model

In [None]:
ml = Model(titanic_data)

In [None]:
X = ml.X_features_drop(['PassengerId','Name','Ticket','Survived'])

In [None]:
X.shape

In [None]:
y = ml.y_target(['Survived'])

In [None]:
X_train, X_test, y_train, y_test = ml.split(X,y,test_size=0.2,random_state=2)

In [None]:
X_train.shape, y_train.shape

In [None]:
model = LogisticRegression()
model.fit(X_train, y_train)

In [None]:
X_train_prediction = model.predict(X_train)

training_data_accuracy = accuracy_score(y_train, X_train_prediction)
print('Accuracy score of training data : ', training_data_accuracy)

X_test_prediction = model.predict(X_test)
test_data_accuracy = accuracy_score(y_test, X_test_prediction)
print('Accuracy score of test data : ', test_data_accuracy)

In [None]:
X.columns

In [None]:
ml.prediction(model = model ,pclass = 3, sex=0, age=35, sibsp=0, parch=0, fare=8.05, embarked=0)

In [None]:
X_test

In [None]:
model.predict(X_test)

In [None]:
y_test.shape, X_test.shape

In [None]:
X.shape, y.shape

# Visualisation

In [None]:
# répartition des survivants par sexe

f_v = 0
f_m = 0
h_v = 0
h_m = 0
for features, target in zip(X.values, y.values):
    if int(features[1]) == 1 and target[0] == 1:
        f_v += 1
    elif int(features[1]) == 1 and target[0] == 0:
        f_m += 1
    elif int(features[1]) == 0 and target[0] == 1:
        h_v += 1
    elif int(features[1]) == 0 and target[0] == 0:
        h_m += 1
#     print(features[1], target[0])
        
# synthaxe à utiliser pour utilisation en htlm "liste de dict" 
[{"Nombre de femmes ayant survécu":f_v},{"Nombre de femmes n'ayant pas survécu":f_m},{"Nombre d'hommes ayant survécu":h_v},{"Nombre d'hommes n'ayant pas survécu":h_m}]

In [None]:
# répartition des survivants par pclass

pclass1_v = 0
pclass1_m = 0
pclass2_v = 0
pclass2_m = 0
pclass3_v = 0
pclass3_m = 0

for features, target in zip(X.values, y.values):
    if int(features[0]) == 1 and target[0] == 1:
        pclass1_v += 1
    elif int(features[0]) == 1 and target[0] == 0:
        pclass1_m += 1
    elif int(features[0]) == 2 and target[0] == 1:
        pclass2_v += 1
    elif int(features[0]) == 2 and target[0] == 0:
        pclass2_m += 1
    elif int(features[0]) == 3 and target[0] == 1:
        pclass3_v += 1
    elif int(features[0]) == 3 and target[0] == 0:
        pclass3_m += 1
[{"Nombre de survivants ayant un ticket en 1ère classe":pclass1_v},{"Nombre de morts ayant un ticket en 1ère classe":pclass1_m},
 {"Nombre de survivants ayant un ticket en 2ème classe":pclass2_v},{"Nombre de morts ayant un ticket en 2ème classe":pclass2_m},
{"Nombre de survivants ayant un ticket en 3ème classe":pclass3_v},{"Nombre de morts ayant un ticket en 3ème classe":pclass3_m}]

In [None]:
X['Age'].describe()

In [None]:
# enfants : 0-10
# adolescents : 10-18
# adultes : 18-50
# personnes âgées : 50 et +
sns.histplot(X['Age'])

In [None]:
# répartition par tranche d'âge
# age = pd.cut(X['Age'], bins = [0, 18, 50, 80])

child_v = 0
child_m = 0
adult_v = 0
adult_m = 0
old_v = 0
old_m = 0

for age, target in zip(X.values, y.values):
    if int(age[2]) >= 0 and int(age[2]) <= 18 and target[0] == 1:
        child_v += 1
    elif int(age[2]) >= 0 and int(age[2]) <= 18 and target[0] == 0:
        child_m += 1
    elif int(age[2]) > 18 and int(age[2]) <= 55 and target[0] == 1:
        adult_v += 1
    elif int(age[2]) > 18 and int(age[2]) <= 55 and target[0] == 0:
        adult_m += 1
    elif int(age[2]) > 55 and target[0] == 0:
        old_v += 1
    elif  int(age[2]) > 55 and target[0] == 1:
        old_m += 1
        
        
[{"name": "Nombre d'enfants ayant survécu'","pourcent": child_v},
        {"name":"Nombre d'enfants n'ayant pas survécu","pourcent":child_m},
        {"name":"Nombre d'adultes n'ayant pas survécu","pourcent":adult_v},
        {"name":"Nombre d'adultes n'ayant pas survécu","pourcent":adult_m},
        {"name":"Nombre de personnes âgées n'ayant pas survécu","pourcent":old_v},
        {"name":"Nombre de personnes âgées n'ayant pas survécu","pourcent":old_m}]

In [None]:
np.amax(X['Age'].values)

In [None]:
age.value_counts()

In [None]:
age.values

# Visualisation par avec ou sans famille