In [18]:
!pip install pandas
!pip install numpy
!pip install sklearn

Collecting sklearn
  Downloading sklearn-0.0.tar.gz (1.1 kB)
Building wheels for collected packages: sklearn
  Building wheel for sklearn (setup.py): started
  Building wheel for sklearn (setup.py): finished with status 'done'
  Created wheel for sklearn: filename=sklearn-0.0-py2.py3-none-any.whl size=1320 sha256=95484fb69ba211f127ccb53d8629522eb334634e0fb95782ae11502ad9fcae69
  Stored in directory: c:\users\kaio_\appdata\local\pip\cache\wheels\46\ef\c3\157e41f5ee1372d1be90b09f74f82b10e391eaacca8f22d33e
Successfully built sklearn
Installing collected packages: sklearn
Successfully installed sklearn-0.0


In [1]:
import pandas as pd 
import numpy as np

#Preprocessing
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler, LabelEncoder
from sklearn.impute import SimpleImputer

#Models
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier

#Training & Metriche
from sklearn.model_selection import cross_val_predict
from sklearn.metrics import accuracy_score

In [2]:
# Load the raw customer dataset from the CSV file next to the notebook
raw_data = pd.read_csv(filepath_or_buffer='telcom_data.csv')

In [3]:
# Convert the TotalCharges feature to a numeric dtype.
# errors='coerce' turns every value that cannot be parsed into NaN instead of raising.
total_charges_numeric = pd.to_numeric(raw_data['TotalCharges'], errors='coerce')
raw_data['TotalCharges'] = total_charges_numeric

In [4]:
# Keep track of which values were missing before imputation: for every column
# that contains at least one null, add a boolean '<column>_was_missing' flag
# to a copy of the data (the original frame is left untouched).
raw_data_copy = raw_data.copy()
has_missing = raw_data.isnull().any()
cols_with_missing = [col for col, any_null in has_missing.items() if any_null]
for col in cols_with_missing:
    missing_flag = raw_data_copy[col].isna()
    raw_data_copy[col+'_was_missing'] = missing_flag

In [5]:
# Impute and rescale the missing/numeric values through a ColumnTransformer,
# which applies a set of transformations to the selected columns only.
#
# TotalCharges is rescaled to the [0, 100] range so that it does not carry a
# preponderant 'weight' compared with the other features.
#
# NOTE(review): the imputer and scaler are fit on the whole dataset here, before
# any cross-validation split is made, so their statistics include the rows that
# are later used for validation.
num_transformer = Pipeline(
    steps=[
        ('imputer', SimpleImputer()),  # imputer with its default settings
        ('scaler', MinMaxScaler(feature_range=(0, 100))),
    ]
)

preprocessor = ColumnTransformer(
    transformers=[('num', num_transformer, ['TotalCharges'])],
    remainder='passthrough',  # keep every other column unchanged
)

# The transformed values are wrapped in a new DataFrame built without column labels.
transformed_values = preprocessor.fit_transform(raw_data_copy)
processed_data = pd.DataFrame(transformed_values)

In [6]:
# The preprocessing step drops the column headers, so they are restored here.
# ColumnTransformer emits the transformed columns first (here only 'TotalCharges')
# and appends the passthrough columns afterwards in their original order.
# The names are derived from raw_data_copy instead of being hard-coded, so the
# labels stay correct even if the CSV column order or column set changes.
transformed_cols = ['TotalCharges']
passthrough_cols = [col for col in raw_data_copy.columns if col not in transformed_cols]
processed_data.columns = transformed_cols + passthrough_cols

In [7]:
# Categorical features selected for label encoding (each category becomes an integer code)
cat_features = ['Churn','gender', 'Partner', 'Dependents', 'PhoneService', 'MultipleLines', 'InternetService', 'OnlineSecurity', 'OnlineBackup', 'DeviceProtection',
       'TechSupport', 'StreamingTV', 'StreamingMovies', 'Contract', 'PaperlessBilling', 'PaymentMethod', 'TotalCharges_was_missing']

# One fitted encoder is kept per feature. Refitting a single shared encoder would
# overwrite its learned classes on every iteration, leaving no way to map the
# integer codes (e.g. of 'Churn') back to the original labels.
# NOTE: re-running this cell refits the encoders on the already-encoded integers.
label_encoders = {}
for feature in cat_features:
    label_encoder = LabelEncoder()
    processed_data[feature] = label_encoder.fit_transform(processed_data[feature])
    label_encoders[feature] = label_encoder

In [8]:
# Columns that are not useful as training features and must be removed
cols_to_drop = [
    'customerID',
    'Churn',
]

In [9]:
# Training data: every processed column except the ones listed in cols_to_drop
X = processed_data.drop(columns=cols_to_drop)
X

Unnamed: 0,TotalCharges,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges_was_missing
0,15.2089,0,0,0,1,21,1,0,0,2,0,2,0,0,2,1,0,3,64.85,0
1,58.9736,0,0,0,0,54,1,2,1,0,2,0,0,2,2,2,1,0,97.2,0
2,0.053658,1,0,1,0,1,0,1,0,0,0,0,0,0,0,0,1,2,23.45,0
3,2.52885,1,0,0,0,4,1,0,1,0,0,0,0,0,0,0,1,2,70.2,0
4,26.2215,1,0,0,1,0,1,2,0,2,2,0,2,0,0,2,1,0,61.9,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5629,0.879298,1,0,1,0,1,1,0,1,2,0,0,0,2,2,0,1,2,95,0
5630,25.15,0,0,1,1,23,1,2,0,2,2,2,2,2,2,2,1,1,91.1,0
5631,3.31468,1,0,1,1,12,1,0,2,1,1,1,1,1,1,0,1,2,21.15,0
5632,13.632,1,1,0,0,12,1,2,1,0,0,2,0,2,2,0,1,2,99.45,0


In [10]:
# Target column, kept as a single-column DataFrame
y = processed_data['Churn'].to_frame()
y

Unnamed: 0,Churn
0,0
1,0
2,1
3,1
4,0
...,...
5629,1
5630,0
5631,0
5632,1


In [11]:
#Training of two kinds of models, KNeighbors and RandomForest, both evaluated with 5-fold cross-validation

In [12]:
# K-nearest neighbours: evaluate 3 different models with an increasing number of
# neighbours. Each model is scored by comparing its 5-fold cross-validated
# predictions with the true labels.
neighbors = [3,5,10]
knn_report = '{:d}NeighborClassifier --- Cross-validation score: {:.2f}'
for n in neighbors:
    k_neighbor = KNeighborsClassifier(algorithm='kd_tree', n_neighbors=n)
    cv_kn = cross_val_predict(estimator=k_neighbor, X=X, y=y.values.ravel(), cv=5)
    accuracy_kn = accuracy_score(y_true=y, y_pred=cv_kn)
    print(knn_report.format(n, accuracy_kn))

3NeighborClassifier --- Cross-validation score: 0.76
5NeighborClassifier --- Cross-validation score: 0.78
10NeighborClassifier --- Cross-validation score: 0.78


In [13]:
# Instantiate the RandomForestClassifier: a forest of decision trees using the
# library's default number of trees (100 according to the original author's note),
# each limited to a maximum depth of 10, with a fixed random seed.
random_forest = RandomForestClassifier(
    max_depth=10,
    random_state=42,
)

In [14]:
# 5-fold cross-validated predictions of the random forest (verbose=1 logs progress),
# then the accuracy of those predictions against the true labels.
cv_random_forest = cross_val_predict(
    estimator=random_forest,
    X=X,
    y=y.values.ravel(),
    cv=5,
    verbose=1,
)
accuracy_random_forest = accuracy_score(y_true=y, y_pred=cv_random_forest)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:    2.0s finished


In [15]:
# Report the random forest's cross-validated accuracy with two decimals
rf_report = 'RF Cross-validation score: {:.2f}'
print(rf_report.format(accuracy_random_forest))

RF Cross-validation score: 0.80
