In [726]:
import numpy
import pandas as pd
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score
from sklearn.preprocessing import OneHotEncoder

In [727]:
sns.set(rc={'figure.figsize': (11.7, 8.27)})

In [728]:
data_train = pd.read_csv("data/train.csv")
len(data_train)

5635

In [729]:
X, y = data_train.drop(columns="Churn"), data_train["Churn"]
categorical_factors = ["gender", "Partner", "Dependents", "PhoneService", "MultipleLines", "InternetService", "OnlineSecurity", "OnlineBackup", "DeviceProtection", "TechSupport", "StreamingTV", "StreamingMovies", "PaperlessBilling", "PaymentMethod"]
numeric_factors = ['id', 'SeniorCitizen', 'tenure', 'MonthlyCharges', 'TotalCharges']
X.head(5)

Unnamed: 0,id,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges
0,1162,Male,0,No,Yes,1,No,No phone service,DSL,No,Yes,No,No,No,No,Yes,Mailed check,31.35,31.35
1,2143,Female,0,No,Yes,21,Yes,No,DSL,Yes,No,Yes,No,No,Yes,No,Mailed check,64.85,1336.8
2,1624,Female,0,No,No,54,Yes,Yes,Fiber optic,No,Yes,No,No,Yes,Yes,Yes,Bank transfer (automatic),97.2,5129.45
3,6075,Male,0,Yes,No,1,No,No phone service,DSL,No,No,No,No,No,No,Yes,Electronic check,23.45,23.45
4,1363,Male,0,No,No,4,Yes,No,Fiber optic,No,No,No,No,No,No,Yes,Electronic check,70.2,237.95


In [730]:
for factor in numeric_factors:
    floated = X[X[factor] != ' '][factor].astype('float')
    X[factor] = X[factor].replace(' ', floated.median())
    X[factor] = X[factor].astype('float')

In [731]:
for factor in categorical_factors:
    not_empty = X[X[factor] != ' '][factor]
    X[factor] = X[factor].replace(' ', not_empty.mode()[0])


In [732]:
onehotencoder = OneHotEncoder()
X_transformed = onehotencoder.fit_transform(X[categorical_factors])
X_transformed = pd.DataFrame(X_transformed.toarray(), columns=onehotencoder.get_feature_names_out(categorical_factors))
X = X.drop(columns=categorical_factors)
X = X.join(X_transformed)
X.head(20)


Unnamed: 0,id,SeniorCitizen,tenure,MonthlyCharges,TotalCharges,gender_Female,gender_Male,Partner_No,Partner_Yes,Dependents_No,Dependents_Yes,PhoneService_No,PhoneService_Yes,MultipleLines_No,MultipleLines_No phone service,MultipleLines_Yes,InternetService_DSL,InternetService_Fiber optic,InternetService_No,OnlineSecurity_No,OnlineSecurity_No internet service,OnlineSecurity_Yes,OnlineBackup_No,OnlineBackup_No internet service,OnlineBackup_Yes,DeviceProtection_No,DeviceProtection_No internet service,DeviceProtection_Yes,TechSupport_No,TechSupport_No internet service,TechSupport_Yes,StreamingTV_No,StreamingTV_No internet service,StreamingTV_Yes,StreamingMovies_No,StreamingMovies_No internet service,StreamingMovies_Yes,PaperlessBilling_No,PaperlessBilling_Yes,PaymentMethod_Bank transfer (automatic),PaymentMethod_Credit card (automatic),PaymentMethod_Electronic check,PaymentMethod_Mailed check
0,1162.0,0.0,1.0,31.35,31.35,0.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0
1,2143.0,0.0,21.0,64.85,1336.8,1.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0
2,1624.0,0.0,54.0,97.2,5129.45,1.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0
3,6075.0,0.0,1.0,23.45,23.45,0.0,1.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
4,1363.0,0.0,4.0,70.2,237.95,0.0,1.0,1.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
5,6755.0,0.0,0.0,61.9,1410.25,0.0,1.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0
6,1213.0,0.0,7.0,69.55,521.35,0.0,1.0,1.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
7,2723.0,0.0,32.0,18.95,613.95,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0
8,4007.0,0.0,72.0,19.8,1414.65,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0
9,6792.0,0.0,19.0,39.65,733.35,0.0,1.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0


In [733]:
y = y.replace('No', 0)
y = y.replace('Yes', 1)
y = y.astype('int')

  y = y.replace('Yes', 1)


In [734]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1000)

In [735]:
model = RandomForestClassifier(n_estimators=1000,
                                                  min_samples_leaf=1,
                                                  max_features=int(len(list(X_train.columns)) ** (1/2)),
                                                  criterion='gini')

In [736]:
model.fit(X_train, y_train)
model.score(X_train, y_train)

1.0

In [737]:
f1_score(y_test, model.predict(X_test), average='weighted')

0.7863509766054434

In [738]:
data_test = pd.read_csv('data/test.csv')

In [739]:
for factor in numeric_factors:
    floated = data_test[data_test[factor] != ' '][factor].astype('float')
    data_test[factor] = data_test[factor].replace(' ', floated.median())
    data_test[factor] = data_test[factor].astype('float')

In [740]:
for factor in categorical_factors:
    not_empty = data_test[data_test[factor] != ' '][factor]
    data_test[factor] = data_test[factor].replace(' ', not_empty.mode()[0])

In [741]:
data_test_transformed = onehotencoder.fit_transform(data_test[categorical_factors])
data_test_transformed = pd.DataFrame(data_test_transformed.toarray(), columns=onehotencoder.get_feature_names_out(categorical_factors))
data_test = data_test.drop(columns=categorical_factors)
data_test = data_test.join(data_test_transformed)

In [742]:
data_test['Churn'] = model.predict(data_test)
data_test['Churn'] = data_test['Churn'].replace(0, 'No')
data_test['Churn'] = data_test['Churn'].replace(1, 'Yes')
data_test['Churn'] = data_test['Churn'].astype('str')
data_test['id'] = data_test['id'].astype('int')
submit = data_test[['id', 'Churn']]
submit.head()
submit.to_csv('data/submit.csv')