In [181]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# Data Extraction
path_traindata = "data/train.csv"
path_testdata = "data/test.csv"
df_train = pd.read_csv(path_traindata)
df_test = pd.read_csv(path_testdata)

df_all = pd.concat([df_train, df_test], ignore_index = True)

In [182]:
df_all.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1309 entries, 0 to 1308
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  1309 non-null   int64  
 1   Survived     891 non-null    float64
 2   Pclass       1309 non-null   int64  
 3   Name         1309 non-null   object 
 4   Sex          1309 non-null   object 
 5   Age          1046 non-null   float64
 6   SibSp        1309 non-null   int64  
 7   Parch        1309 non-null   int64  
 8   Ticket       1309 non-null   object 
 9   Fare         1308 non-null   float64
 10  Cabin        295 non-null    object 
 11  Embarked     1307 non-null   object 
dtypes: float64(3), int64(4), object(5)
memory usage: 122.8+ KB


In [183]:
df_all.describe()

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
count,1309.0,891.0,1309.0,1046.0,1309.0,1309.0,1308.0
mean,655.0,0.383838,2.294882,29.881138,0.498854,0.385027,33.295479
std,378.020061,0.486592,0.837836,14.413493,1.041658,0.86556,51.758668
min,1.0,0.0,1.0,0.17,0.0,0.0,0.0
25%,328.0,0.0,2.0,21.0,0.0,0.0,7.8958
50%,655.0,0.0,3.0,28.0,0.0,0.0,14.4542
75%,982.0,1.0,3.0,39.0,1.0,0.0,31.275
max,1309.0,1.0,3.0,80.0,8.0,9.0,512.3292


![](images/Cabin.png)

In [184]:
df_all['Cabin'].nunique()

186

In [185]:
df_all['Cabin'].value_counts()

Cabin
C23 C25 C27        6
G6                 5
B57 B59 B63 B66    5
C22 C26            4
F33                4
                  ..
A14                1
E63                1
E12                1
E38                1
C105               1
Name: count, Length: 186, dtype: int64

In [186]:
# only use the head
df_all['Cabin'].fillna('U',inplace=True)
df_all['Cabin'] = df_all['Cabin'].map(lambda s: s[0])

In [187]:
df_all['Cabin'].nunique()

9

In [188]:
df_all['Cabin'].value_counts()

Cabin
U    1014
C      94
B      65
D      46
E      41
A      22
F      21
G       5
T       1
Name: count, dtype: int64

In [189]:
df_all[df_all['Cabin']=='T']

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
339,340,0.0,1,"Blackwell, Mr. Stephen Weart",male,45.0,0,0,113784,35.5,T,S


In [190]:
# assume the fare correlates with cabin
def cabin_estimator(i):
    """Grouping cabin feature by the first letter"""
    a = 0
    if i<16:
        a = "G"
    elif i>=16 and i<27:
        a = "F"
    elif i>=27 and i<47:
        a = "A"
    elif i>= 47 and i<53:
        a = "E"
    elif i>= 53 and i<54:
        a = "D"
    elif i>=54 and i<116:
        a = 'C'
    else:
        a = "B"
    return a

df_all.loc[df_all['Cabin'] == 'U','Cabin'] = df_all[df_all['Cabin'] == 'U']['Fare'].apply(lambda x: cabin_estimator(x))
df_all.loc[df_all['Cabin'] == 'T','Cabin'] = df_all[df_all['Cabin'] == 'T']['Fare'].apply(lambda x: cabin_estimator(x))
df_all['Cabin']

0       G
1       C
2       G
3       C
4       G
       ..
1304    G
1305    C
1306    G
1307    G
1308    F
Name: Cabin, Length: 1309, dtype: object

In [191]:
df_all['Cabin'].value_counts()

Cabin
G    686
F    192
C    138
A    123
B     78
E     46
D     46
Name: count, dtype: int64

In [192]:
df_all['Title'] = df_all['Name'].map(lambda name:name.split(',')[1].split('.')[0].strip())



In [193]:
df_all['Title'].unique()

array(['Mr', 'Mrs', 'Miss', 'Master', 'Don', 'Rev', 'Dr', 'Mme', 'Ms',
       'Major', 'Lady', 'Sir', 'Mlle', 'Col', 'Capt', 'the Countess',
       'Jonkheer', 'Dona'], dtype=object)

In [194]:
df_all['Title'].replace(['Mlle','Mme','Ms','Dr','Major','Lady','the Countess','Jonkheer','Col','Rev','Capt','Sir','Don','Dona'],
                        ['Miss','Miss','Miss','Mr','Mr','Mrs','Mrs','Other','Other','Other','Mr','Mr','Mr','Mrs'],inplace=True)

In [195]:
df_all['Title'].unique()

array(['Mr', 'Mrs', 'Miss', 'Master', 'Other'], dtype=object)

In [196]:
df_all.groupby("Title")['Age'].mean().sort_values()

Title
Master     5.482642
Miss      21.834533
Mr        32.545531
Mrs       37.046243
Other     44.923077
Name: Age, dtype: float64

In [197]:
df_all.loc[(df_all['Age'].isnull())&(df_all['Title']=='Mr'),'Age']=33
df_all.loc[(df_all['Age'].isnull())&(df_all['Title']=='Mrs'),'Age']=37
df_all.loc[(df_all['Age'].isnull())&(df_all['Title']=='Master'),'Age']=5
df_all.loc[(df_all['Age'].isnull())&(df_all['Title']=='Miss'),'Age']=22
df_all.loc[(df_all['Age'].isnull())&(df_all['Title']=='Other'),'Age']=45

# same process
df_all['Embarked'].fillna('S',inplace=True)
df_all['FamilySize'] = df_all['SibSp'] + df_all['Parch'] + 1
# print(df_all['FamilySize'])

df_all['Age'] = pd.cut(df_all['Age'], 5)
# cut divides the data into a specified number of equi-width intervals
# print(df_all['Age'].value_counts())

# print(df_all['Fare'].describe())

# print(df_all['Fare'].describe(percentiles = [0.6,0.9,0.98]))
# print(df_all[df_all['Fare'] > 300])

bins = [0, 14, 78, 220, 500, 600]
labels = ['VeryLow','Low', 'Middle', 'High', 'VeryHigh']
df_all['Fare'] = pd.cut(df_all['Fare'], bins=bins, labels=labels, right=False)
# print(df_all['Fare'].value_counts())

In [198]:
## One Hot Encoding
from sklearn.preprocessing import LabelEncoder

map_features = ['Cabin','Sex', 'Title', 'Age', 'Fare','Embarked']
for feature in map_features:
    df_all[feature] = LabelEncoder().fit_transform(df_all[feature])

map_features_2 = ['Pclass','Sex','Title','Cabin','Embarked','FamilySize','Fare','Age']
encoded_features = pd.get_dummies(df_all[map_features_2], columns=map_features_2)


In [199]:
encoded_features.columns

Index(['Pclass_1', 'Pclass_2', 'Pclass_3', 'Sex_0', 'Sex_1', 'Title_0',
       'Title_1', 'Title_2', 'Title_3', 'Title_4', 'Cabin_0', 'Cabin_1',
       'Cabin_2', 'Cabin_3', 'Cabin_4', 'Cabin_5', 'Cabin_6', 'Embarked_0',
       'Embarked_1', 'Embarked_2', 'FamilySize_1', 'FamilySize_2',
       'FamilySize_3', 'FamilySize_4', 'FamilySize_5', 'FamilySize_6',
       'FamilySize_7', 'FamilySize_8', 'FamilySize_11', 'Fare_0', 'Fare_1',
       'Fare_2', 'Fare_3', 'Fare_4', 'Fare_5', 'Age_0', 'Age_1', 'Age_2',
       'Age_3', 'Age_4'],
      dtype='object')

## Train

In [200]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV, KFold
from sklearn.metrics import accuracy_score

In [201]:
# 数据筛选
train_x = encoded_features.iloc[:df_train.shape[0]].values
test_x = encoded_features.iloc[df_train.shape[0]:].values
train_y = df_all['Survived'].iloc[:df_train.shape[0]].values

In [202]:
# Define the model
model_LogisticRegression = LogisticRegression()

# Define the hyperparameters grid to search
param_grid = {
    'C': [0.01, 0.1, 1, 10, 100],  # Regularization strength
    'solver': ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga']  # Different solvers
}

# Define the K-fold cross-validator
kf = KFold(n_splits=5, shuffle=True, random_state=42)

# Define the GridSearchCV
grid_search = GridSearchCV(model_LogisticRegression, param_grid, cv=kf, scoring='accuracy', n_jobs=-1)

# Fit the model
grid_search.fit(train_x, train_y)

# Get the best model
best_model = grid_search.best_estimator_

# Make predictions on the test set
test_y = best_model.predict(test_x).astype(int)



4921.25s - pydevd: Sending message related to process being replaced timed-out after 5 seconds
4921.44s - pydevd: Sending message related to process being replaced timed-out after 5 seconds
4921.62s - pydevd: Sending message related to process being replaced timed-out after 5 seconds
4921.86s - pydevd: Sending message related to process being replaced timed-out after 5 seconds
4922.01s - pydevd: Sending message related to process being replaced timed-out after 5 seconds
4922.42s - pydevd: Sending message related to process being replaced timed-out after 5 seconds
4922.61s - pydevd: Sending message related to process being replaced timed-out after 5 seconds
4922.79s - pydevd: Sending message related to process being replaced timed-out after 5 seconds


4922.97s - pydevd: Sending message related to process being replaced timed-out after 5 seconds
4923.15s - pydevd: Sending message related to process being replaced timed-out after 5 seconds
4923.33s - pydevd: Sending message related to process being replaced timed-out after 5 seconds
4923.52s - pydevd: Sending message related to process being replaced timed-out after 5 seconds
4923.70s - pydevd: Sending message related to process being replaced timed-out after 5 seconds


In [203]:
# Create a DataFrame with the predictions
predictions_df = pd.DataFrame({'PassengerId': df_test['PassengerId'], 'Survived': test_y})

# Save the predictions to a CSV file
output_file = "data/output/Predictions_LogisticRegression_Improved.csv"
predictions_df.to_csv(output_file, index=False)

print("Predictions saved to:", output_file)
print("Best Hyperparameters:", grid_search.best_params_)
print("Best Cross-Validation Score:", grid_search.best_score_)

Predictions saved to: data/output/Predictions_LogisticRegression_Improved.csv
Best Hyperparameters: {'C': 10, 'solver': 'newton-cg'}
Best Cross-Validation Score: 0.8215554579122466
