<a href="https://colab.research.google.com/github/Madelinelai/Kaggle/blob/main/titanic.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
from urllib.request import urlretrieve
# Download the Titanic training set into the working directory as train.csv.
url = "https://github.com/Madelinelai/Kaggle/raw/main/titanic/train.csv"
urlretrieve(url, "train.csv")
# Download the test set as test.csv; `url` is rebound and ends up pointing at the test file.
url = "https://github.com/Madelinelai/Kaggle/raw/main/titanic/test.csv"
urlretrieve(url, "test.csv")

In [None]:
import pandas as pd
# Load the two CSV files fetched above. train_df carries the Survived label
# (used later as y_train); test_df supplies the PassengerId values for the submissions.
train_df = pd.read_csv("train.csv", encoding="utf-8")
test_df = pd.read_csv("test.csv", encoding="utf-8")

In [None]:
# Display the test frame for a quick visual check of its columns and values.
test_df

In [None]:
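# Data preprocessing: filling in missing values
# Step 1 (categorical columns): Cabin (keep only its first letter), Embarked
# Step 2: Pclass, Name (the title in the middle of the name), Sex
# Step 3: Age, SibSp, Parch, Ticket (count how many passengers travel on the same ticket), Fare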

In [None]:
# Stack the train and test rows into one frame so that every preprocessing
# step below is applied to both. The passenger identifier and the label are
# removed because they are not model features.
data = (
    pd.concat([train_df, test_df], ignore_index=True)
    .drop(columns=["PassengerId", "Survived"])
)

In [None]:
# Count the missing values in every column of the combined frame.
na = data.isna().sum()
# Boolean-mask selection (one True/False per entry): keep only the columns
# that actually have gaps and list them from most to fewest missing values.
na.loc[na.gt(0)].sort_values(ascending=False)

In [None]:
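# Fill categorical gaps with the most frequent category and numeric gaps with the median.
# Cabin contains missing values, so it is handled element by element with apply.
# Illustration of how apply works:
#a = pd.Series([1,2,3])
#def func(n):
#  return n
#a.apply(func)  # calls func once per element and collects the results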

In [None]:
def cabin_head(s):
    """Return the first character of a cabin code, or None when the value is missing.

    Args:
        s: A single Cabin cell; either a string such as "C85" or a missing
           value (NaN / None).

    Returns:
        The leading character of ``s``, or None if ``s`` is missing.
    """
    if pd.isna(s):
        return None
    return s[0]
# Replace every Cabin value with whatever cabin_head returns for it
# (missing cabins come back as None and therefore stay missing).
data["Cabin"] = data["Cabin"].apply(cabin_head)

In [None]:
# Turn each ticket number into the number of passengers sharing that ticket,
# i.e. the size of the travelling party.
# NOTE: this overwrites the Ticket column, so the cell is not meant to be run twice.
dic = data["Ticket"].value_counts()
data["Ticket"] = data["Ticket"].map(dic)

In [None]:
# Categorical imputation: fill missing Embarked values with the most frequent port.
# (Original note: when more test data arrives later, do not recompute the mode;
# reuse the value found here. The note names it as "S".)
most = data['Embarked'].value_counts().index[0]  # value_counts is sorted, so index[0] is the mode
data['Embarked'] = data['Embarked'].fillna(most)
# Re-check which columns still contain missing values (boolean-mask selection).
na = data.isna().sum()
na.loc[na.gt(0)].sort_values(ascending=False)

In [None]:
# Numeric imputation: fill missing numeric values with the column median.
# numeric_only=True is required here: the frame still contains text columns
# (Name, Sex, Cabin, Embarked) and pandas >= 2.0 raises TypeError when asked
# for the median of those instead of silently skipping them.
# Pclass is removed from the fill values, so that column is never imputed.
med = data.median(numeric_only=True).drop(["Pclass"])
data = data.fillna(med)
# Re-check which columns still contain missing values (boolean-mask selection).
na = data.isna().sum()
na[na > 0].sort_values(ascending=False)

In [None]:
# (A detailed walkthrough of this step is given in the recorded video.)

def _extract_title(s):
    """Return the text between the last comma and the following period of a name.

    For example "Braund, Mr. Owen Harris" gives "Mr".
    """
    return s.split(",")[-1].split(".")[0].strip()


# How often each title occurs in the combined data; only titles seen more
# than 50 times are kept as categories of their own.
counts = data["Name"].apply(_extract_title).value_counts()
whitelist = counts[counts > 50].index


def name_convert(s):
    """Map a full passenger name to its title, or None when the title is rare.

    Args:
        s: Full name string in the form "Surname, Title. Given names".

    Returns:
        The title if it is in ``whitelist``, otherwise None.
    """
    title = _extract_title(s)
    return title if title in whitelist else None


data["Name"] = data["Name"].apply(name_convert)

In [None]:
# Check once more whether any remaining column still needs ETL.
# Original note: Pclass is an ordinal 1/2/3 category and Sex is binary, so they
# could be skipped. The code below nevertheless one-hot encodes Pclass explicitly,
# after the first call has encoded the columns get_dummies selects by default.
# Original note: the frame has 25 columns after this step, as shown below.
data = pd.get_dummies(data)
data = pd.get_dummies(data, columns=["Pclass"])
data

In [None]:
# Feature engineering: Family = SibSp + Parch (siblings/spouses plus parents/children).
# Engineered features like this are attempts to raise model accuracy; do not
# casually delete existing columns from the data.
data["Family"] = data["SibSp"].add(data["Parch"])
# Original note: the frame has 26 columns after this step, as shown below.
data

In [None]:
# Sanity check: number of rows, then (rows, columns) of the combined feature frame.
print(len(data))
print(data.shape)

In [None]:
# Split the combined frame back into its two parts by position.
# Use .iloc (position-based) rather than .loc (label-based): the first
# len(train_df) rows are the training rows, everything after is the test set.
x_train = data.iloc[:len(train_df)]
x_predict = data.iloc[len(train_df):]
y_train = train_df["Survived"]

In [None]:
# Display the final feature frame that the models are trained on.
data

In [None]:
import numpy as np
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestClassifier
# Hyper-parameter grid for the random forest. Each entry is a sequence of
# candidate values (a list such as [1, 2, 3] or a range).
params = {
    "n_estimators": range(20, 100),  # 20..99 trees
    "max_depth": range(3, 11),       # depth 3..10
}
# A fixed random_state makes the search reproducible; without it the best
# score and best parameters change from run to run.
clf = RandomForestClassifier(random_state=42)
# Exhaustive search with 10-fold cross-validation, using all CPU cores.
cv = GridSearchCV(clf, params, cv=10, n_jobs=-1)
cv.fit(x_train, y_train)
print(cv.best_score_)
print(cv.best_params_)

In [None]:
# See scikit-learn's model_selection.cross_val_score for details.
# random_state keeps the scores, the plotted tree and the importances reproducible.
clf = RandomForestClassifier(n_estimators=25, max_depth=6, random_state=42)
scores = cross_val_score(clf, x_train, y_train, cv=10, n_jobs=-1)
# cross_val_score only fits clones of `clf`, one per fold, and leaves `clf`
# itself unfitted. Fit it on the full training set here because the following
# cells read clf.estimators_ and clf.feature_importances_.
clf.fit(x_train, y_train)
print("10:", scores)
print("average:", np.average(scores))

# Inspect one tree and the feature importances of the RandomForest

In [None]:
import matplotlib.pyplot as plt
from sklearn.tree import plot_tree
# Number of decision trees in the fitted forest.
print(len(clf.estimators_))
plt.figure(figsize=(10, 10))
# Draw the top levels of the third tree. feature_names is passed as a plain
# list because some scikit-learn releases validate this parameter and reject
# a pandas Index.
plot_tree(clf.estimators_[2],
          feature_names=list(data.columns),
          class_names=["Dead", "Alived"],
          max_depth=2,
          filled=True)
# Render the figure explicitly so the cell does not print the list of
# annotation objects returned by plot_tree.
plt.show()

In [None]:
# Pair every feature name with the importance the forest assigned to it,
# then show the table with the most important features first.
feature_ranking = {"Name": data.columns, "Importance": clf.feature_importances_}
pd.DataFrame(feature_ranking).sort_values("Importance", ascending=False)

In [None]:
# With the feature importances understood, train the final forest, predict the
# test set and write the file to upload to Kaggle.
# random_state makes the written submission reproducible across runs.
clf = RandomForestClassifier(n_estimators=97, max_depth=8, random_state=42)
clf.fit(x_train, y_train)
pre = clf.predict(x_predict)
# Kaggle expects exactly two columns: PassengerId and the predicted Survived.
df = pd.DataFrame({
    "PassengerId": test_df["PassengerId"],
    "Survived": pre
})
df.to_csv("rflai.csv", encoding="utf-8", index=False)
df

# Try KNN (the features must go through a Scaler first, to pin down the center point)

In [None]:

from sklearn.preprocessing import MinMaxScaler, StandardScaler, RobustScaler
# Rescale every feature with MinMaxScaler; KNN is distance based, so features
# need to be on a comparable scale.
scaler = MinMaxScaler()
# fit_transform returns a bare array, so wrap it in a DataFrame again with
# the original column names.
data_scale = pd.DataFrame(scaler.fit_transform(data), columns=data.columns)
# Positional split (.iloc, not .loc): training rows first, test rows after.
x_train_scale = data_scale.iloc[:len(train_df)]
x_predict_scale = data_scale.iloc[len(train_df):]
x_train_scale

In [None]:
from sklearn.neighbors import KNeighborsClassifier
# Search k = 3..99 for the KNN classifier with 10-fold cross-validation on
# the scaled training features, using all CPU cores.
params = {"n_neighbors": range(3, 100)}
clf = KNeighborsClassifier()
cv = GridSearchCV(estimator=clf, param_grid=params, cv=10, n_jobs=-1)
cv.fit(x_train_scale, y_train)
# Best mean cross-validated score, then the parameters that achieved it.
print(cv.best_score_, cv.best_params_, sep="\n")

In [None]:

# Train KNN with k=11 on the scaled features, predict the scaled test set and
# write the file to upload to Kaggle.
clf = KNeighborsClassifier(n_neighbors=11).fit(x_train_scale, y_train)
pre = clf.predict(x_predict_scale)
df = pd.DataFrame({"PassengerId": test_df["PassengerId"], "Survived": pre})
df.to_csv("knn.csv", encoding="utf-8", index=False)
df
# This submission scored 0.80143 on Kaggle.

# Try GradientBoostingClassifier

In [None]:
import pandas as pd
# Start over from the raw CSV files for the gradient-boosting experiment.
train = pd.read_csv("train.csv")
test = pd.read_csv("test.csv")
# Stack train (without its label) on top of test to inspect both together.
combined = pd.concat([train.drop(columns='Survived'), test])

In [None]:
# Summarise the combined frame: column dtypes and non-null counts.
combined.info()

In [None]:
import numpy as np
import pandas as pd
# Reload the raw data so this section does not depend on earlier preprocessing.
train = pd.read_csv("train.csv")
test = pd.read_csv("test.csv")
# Impute by assigning the result back. `train[col].fillna(..., inplace=True)`
# is chained assignment: it emits a FutureWarning on pandas 2.x and has no
# effect on `train` once Copy-on-Write is enabled.
train['Age'] = train['Age'].fillna(train['Age'].median())  # missing Age -> median
train['Embarked'] = train['Embarked'].fillna(train['Embarked'].value_counts().index[0])  # missing Embarked -> most frequent
# Convert the passenger class from 1/2/3 to '1st'/'2nd'/'3rd' so that it is
# treated as a categorical variable by get_dummies below.
d = {1:'1st',2:'2nd',3:'3rd'}
train['Pclass'] = train['Pclass'].map(d)
# Drop the columns this model does not use. `columns=` is used because the
# positional axis argument (`drop(labels, 1)`) was removed in pandas 2.0.
train = train.drop(columns=['PassengerId','Name','Ticket','Cabin'])
# One-hot encode the categorical variables; drop_first removes one level per variable.
categorical_vars = train[['Pclass','Sex','Embarked']]
dummies = pd.get_dummies(categorical_vars,drop_first=True)
# Replace the original categorical columns with their dummy columns.
train = train.drop(['Pclass','Sex','Embarked'],axis=1)
train = pd.concat([train,dummies],axis=1)
train.head()  # check the cleaned training data

In [None]:
# Separate the label from the features. `columns=` is used because the
# positional axis argument (`drop(labels, 1)`) was removed in pandas 2.0.
y = train['Survived']
X = train.drop(columns=['Survived'])

# Hold out 30% of the training data for evaluation. random_state fixes the
# split so the reported metrics are reproducible.
# NOTE: this rebinds y_train, which earlier cells used for the full label column.
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [None]:
from sklearn.ensemble import GradientBoostingClassifier
# Train the gradient-boosting classifier on the training split (fit returns
# the estimator itself), then predict the held-out split for evaluation.
model = GradientBoostingClassifier(max_depth=3, learning_rate=0.1).fit(X_train, y_train)
predictions = model.predict(X_test)

In [None]:
from sklearn.metrics import confusion_matrix, classification_report
# Evaluate the hold-out predictions: confusion matrix first, then the
# per-class classification report.
print(confusion_matrix(y_test, predictions))
print(classification_report(y_test,predictions))
# Original note: accuracy is about 80%.

In [None]:
# Apply the same preprocessing steps to the test set.
# Impute by assigning the result back. `test[col].fillna(..., inplace=True)`
# is chained assignment: it emits a FutureWarning on pandas 2.x and has no
# effect on `test` once Copy-on-Write is enabled.
test['Age'] = test['Age'].fillna(test['Age'].median())  # Age
test['Fare'] = test['Fare'].fillna(test['Fare'].median())  # Fare
d = {1:'1st',2:'2nd',3:'3rd'}  # Pclass
test['Pclass'] = test['Pclass'].map(d)
test['Embarked'] = test['Embarked'].fillna(test['Embarked'].value_counts().index[0])  # Embarked
# Keep the passenger ids for the submission file before the column is dropped.
ids = test[['PassengerId']]
# Drop the columns the model does not use. `columns=` is used because the
# positional axis argument (`drop(labels, 1)`) was removed in pandas 2.0.
test = test.drop(columns=['PassengerId','Name','Ticket','Cabin'])
# One-hot encode the categorical variables exactly as was done for train.
categorical_vars = test[['Pclass','Sex','Embarked']]
dummies = pd.get_dummies(categorical_vars,drop_first=True)
# Replace the original categorical columns with their dummy columns.
test = test.drop(['Pclass','Sex','Embarked'],axis=1)
test = pd.concat([test,dummies],axis=1)

In [None]:
# Select exactly the training feature columns, in training order, before
# predicting. train and test were one-hot encoded independently, so this
# raises a clear KeyError if a column is missing and removes any dependence
# on the two frames happening to share the same column order.
preds = model.predict(test[X.columns])
results = ids.assign(Survived=preds)
results.to_csv('titanic_submission.csv',index=False)
# This submission scored 0.73684 on Kaggle.