## Import delle librerie

In [1]:
# Manipolazione dati
import pandas as pd
import numpy as np

# Visualizzazione (se necessaria per controlli)
import matplotlib.pyplot as plt
import seaborn as sns

# Modellazione
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix

## Caricamento del dataset


In [2]:
# Location of the raw Telco customer-churn CSV, relative to the working directory.
RAW_CHURN_CSV = "../data/raw/telco_customer_churn.csv"

# Read the whole file into a DataFrame.
df = pd.read_csv(RAW_CHURN_CSV)

# Show the first rows as a quick sanity check of the load.
df.head()

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,7590-VHVEG,Female,0,Yes,No,1,No,No phone service,DSL,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,5575-GNVDE,Male,0,No,No,34,Yes,No,DSL,Yes,...,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
2,3668-QPYBK,Male,0,No,No,2,Yes,No,DSL,Yes,...,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,7795-CFOCW,Male,0,No,No,45,No,No phone service,DSL,Yes,...,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.3,1840.75,No
4,9237-HQITU,Female,0,No,No,2,Yes,No,Fiber optic,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,Yes


## 1. Data Quality Checks

Prima di procedere con la preparazione dei dati per la modellazione, è necessario verificare la presenza di eventuali criticità tecniche, come valori mancanti o formati non coerenti.

In [3]:
# Count missing values (NaN/None) per column.
# Note: blank or whitespace-only strings are NOT treated as missing by this check.
df.isna().sum()

customerID          0
gender              0
SeniorCitizen       0
Partner             0
Dependents          0
tenure              0
PhoneService        0
MultipleLines       0
InternetService     0
OnlineSecurity      0
OnlineBackup        0
DeviceProtection    0
TechSupport         0
StreamingTV         0
StreamingMovies     0
Contract            0
PaperlessBilling    0
PaymentMethod       0
MonthlyCharges      0
TotalCharges        0
Churn               0
dtype: int64

### Verifica duplicati e tipo di dato

In [4]:
# Count rows that are exact duplicates of an earlier row (all columns compared).
df.duplicated().sum()

np.int64(0)

In [6]:
# Print a summary of the frame: index range, columns, non-null counts and dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7043 entries, 0 to 7042
Data columns (total 21 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   customerID        7043 non-null   object 
 1   gender            7043 non-null   object 
 2   SeniorCitizen     7043 non-null   int64  
 3   Partner           7043 non-null   object 
 4   Dependents        7043 non-null   object 
 5   tenure            7043 non-null   int64  
 6   PhoneService      7043 non-null   object 
 7   MultipleLines     7043 non-null   object 
 8   InternetService   7043 non-null   object 
 9   OnlineSecurity    7043 non-null   object 
 10  OnlineBackup      7043 non-null   object 
 11  DeviceProtection  7043 non-null   object 
 12  TechSupport       7043 non-null   object 
 13  StreamingTV       7043 non-null   object 
 14  StreamingMovies   7043 non-null   object 
 15  Contract          7043 non-null   object 
 16  PaperlessBilling  7043 non-null   object 


In [13]:
# Inspect the first 100 distinct TotalCharges values to see how the column
# is actually encoded (limited to 100 to keep the output readable).
df["TotalCharges"].unique()[:100]

array(['29.85', '1889.5', '108.15', '1840.75', '151.65', '820.5',
       '1949.4', '301.9', '3046.05', '3487.95', '587.45', '326.8',
       '5681.1', '5036.3', '2686.05', '7895.15', '1022.95', '7382.25',
       '528.35', '1862.9', '39.65', '202.25', '20.15', '3505.1', '2970.3',
       '1530.6', '4749.15', '30.2', '6369.45', '1093.1', '6766.95',
       '181.65', '1874.45', '20.2', '45.25', '7251.7', '316.9', '3548.3',
       '3549.25', '1105.4', '475.7', '4872.35', '418.25', '4861.45',
       '981.45', '3906.7', '97', '144.15', '4217.8', '4254.1', '3838.75',
       '1426.4', '1752.65', '633.3', '4456.35', '1752.55', '6311.2',
       '7076.35', '894.3', '7853.7', '4707.1', '5450.7', '2962', '957.1',
       '857.25', '244.1', '3650.35', '2497.2', '930.9', '887.35', '49.05',
       '1090.65', '7099', '1424.6', '177.4', '6139.5', '2688.85',
       '482.25', '2111.3', '1216.6', '79.35', '565.35', '496.9', '4327.5',
       '973.35', '918.75', '2215.45', '1057', '927.1', '1009.25',
       '257

In [8]:
# Count the entries of TotalCharges that are exactly a single-space string.
df["TotalCharges"].eq(" ").sum()

np.int64(11)

In [14]:
# Try to parse TotalCharges as numbers; values that cannot be parsed become NaN
# (errors="coerce"), so the NaN count is the number of non-numeric entries.
df["TotalCharges"].pipe(pd.to_numeric, errors="coerce").isna().sum()

np.int64(11)

In [15]:
# List the distinct raw TotalCharges values that fail numeric parsing,
# i.e. those that turn into NaN under errors="coerce".
df["TotalCharges"].loc[
    lambda raw: pd.to_numeric(raw, errors="coerce").isna()
].unique()

array([' '], dtype=object)