In [1]:
import pandas as pd
from itertools import product
import numpy as np
from collections import Counter
from sklearn.preprocessing import OneHotEncoder

In [2]:
def transforme(V:str):
    try:return int(V.split()[-1])
    except:return V
        
def transform_ticket(tickets:list[str])->list[float]:
    nouveaux=[transforme(V) for V in tickets]
    moyenne=np.mean([n for n in nouveaux if isinstance(n,int)])
    for i,v in enumerate(nouveaux):
        if type(v)==str:nouveaux[i]=moyenne
    return nouveaux

def isNaN(num):
    return num != num

In [3]:
df1=pd.read_csv('train.csv')
df2=pd.read_csv('test.csv')
y=df1.Survived.values.copy()
df1.drop(columns=["Survived"],inplace=True)
df=pd.concat([df1,df2],axis=0)
df.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1309 entries, 0 to 417
Data columns (total 11 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  1309 non-null   int64  
 1   Pclass       1309 non-null   int64  
 2   Name         1309 non-null   object 
 3   Sex          1309 non-null   object 
 4   Age          1046 non-null   float64
 5   SibSp        1309 non-null   int64  
 6   Parch        1309 non-null   int64  
 7   Ticket       1309 non-null   object 
 8   Fare         1308 non-null   float64
 9   Cabin        295 non-null    object 
 10  Embarked     1307 non-null   object 
dtypes: float64(2), int64(4), object(5)
memory usage: 122.7+ KB


Stratégie pour transformer les colonnes object
- Sex --> 0 ou 1 : appliquer f=lambda s:1*(s=='male')
- Name --> tfidf vectorizer de scikit learn
- Ticket : un split() puis garder le int du dernier (position -1 dans le split)
Stratégie pour remplacer les valeurs manquantes
- age : age moyen par sex et Pclass
- Cabin : valeur majoritaire de lettre de cabine par Pclass
- Embarked : remplacer les manquantes par "X"
- Fare : remplacer les manquantes par moyenne de Fare par Pclass

In [5]:
df.Sex=df.Sex.apply(lambda s:1*(s=='male'))
df.Ticket=transform_ticket(df.Ticket.values)
dico_age={(s,p):df.Age[(df.Sex==s)*(df.Pclass==p)==1].mean() 
          for s,p in product(df.Sex.unique(),df.Pclass.unique())}
manquants=np.where(df.Age.isnull())[0]
df.Age=[dico_age[(df.Sex.iloc[i],df.Pclass.iloc[i])] 
        if i in manquants else df.Age.iloc[i] for i in range(len(df))]
df.Embarked.fillna("X",inplace=True)

In [6]:
dico_class={p:Counter([v[0] for v in df.Cabin[df.Pclass==p] 
                       if not isNaN(v)]).most_common(1)[0][0]
            for p in df.Pclass.unique()}
df.Cabin=[v[0] if not isNaN(v) else dico_class[df.Pclass.iloc[i]] 
          for i,v in enumerate(df.Cabin.values)]

In [7]:
dico_fare={p:df.Fare[df.Pclass==p].mean() for p in {1,2,3}}
df.Fare=[v if not isNaN(v) else dico_fare[df.Pclass.iloc[i]] for 
        i,v in enumerate(df.Fare)]

In [8]:
df.Cabin.unique(),df.Embarked.unique()

(array(['F', 'C', 'E', 'G', 'D', 'A', 'B', 'T'], dtype=object),
 array(['S', 'C', 'Q', 'X'], dtype=object))

In [9]:
C_cabin=OneHotEncoder(drop='first',sparse_output=False)
C_cabin.fit(np.reshape(df.Cabin.values,(-1,1)))
transformees=C_cabin.transform(np.reshape(df.Cabin.values,(-1,1)))
print(type(transformees))
print(C_cabin.get_feature_names_out())
colonnes=['Cabin_'+f[-1] for f in C_cabin.get_feature_names_out()]
print(colonnes)
cabines=pd.DataFrame(columns=colonnes,data=transformees)
cabines.index=df.index
df=pd.concat([df.copy(),cabines],axis=1)
df.drop(columns=['Cabin'],inplace=True)


<class 'numpy.ndarray'>
['x0_B' 'x0_C' 'x0_D' 'x0_E' 'x0_F' 'x0_G' 'x0_T']
['Cabin_B', 'Cabin_C', 'Cabin_D', 'Cabin_E', 'Cabin_F', 'Cabin_G', 'Cabin_T']


In [10]:
C_embarked=OneHotEncoder(drop='first',sparse_output=False)
C_embarked.fit(np.reshape(df.Embarked.values,(-1,1)))
transformees=C_embarked.transform(np.reshape(df.Embarked.values,(-1,1)))
print(type(transformees))
print(C_embarked.get_feature_names_out())
colonnes=['Port_'+f[-1] for f in C_embarked.get_feature_names_out()]
print(colonnes)
embarquement=pd.DataFrame(columns=colonnes,data=transformees)
embarquement.index=df.index
df=pd.concat([df.copy(),embarquement],axis=1)
df.drop(columns=['Embarked'],inplace=True)
df.columns

<class 'numpy.ndarray'>
['x0_Q' 'x0_S' 'x0_X']
['Port_Q', 'Port_S', 'Port_X']


Index(['PassengerId', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp', 'Parch',
       'Ticket', 'Fare', 'Cabin_B', 'Cabin_C', 'Cabin_D', 'Cabin_E', 'Cabin_F',
       'Cabin_G', 'Cabin_T', 'Port_Q', 'Port_S', 'Port_X'],
      dtype='object')

In [11]:
df1=df.iloc[:891,:]
df1["Survived"]=y
df2=df.iloc[891:,:]
df1.shape,df2.shape

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df1["Survived"]=y


((891, 20), (418, 19))

In [12]:
df1.to_csv("train_net.csv")
df2.to_csv("test_net.csv")

In [13]:
df0=pd.read_csv("train_net.csv")
df0.head(2)

Unnamed: 0.1,Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,...,Cabin_C,Cabin_D,Cabin_E,Cabin_F,Cabin_G,Cabin_T,Port_Q,Port_S,Port_X,Survived
0,0,1,3,"Braund, Mr. Owen Harris",1,22.0,1,0,21171.0,7.25,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0
1,1,2,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",0,38.0,1,0,17599.0,71.2833,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
