In [27]:
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

# Load the data
data = pd.read_csv('telecom_customer_churn.csv')  # Replace 'your_data.csv' with the actual file path

# Define numerical and categorical columns
numerical_cols = ['Age', 'Number of Dependents', 'Latitude', 'Longitude', 
                  'Number of Referrals', 'Tenure in Months', 'Avg Monthly Long Distance Charges',
                  'Avg Monthly GB Download', 'Monthly Charge', 'Total Charges', 
                  'Total Refunds', 'Total Extra Data Charges', 'Total Long Distance Charges', 
                  'Total Revenue']
categorical_cols = ['Gender', 'Married', 'City', 'Zip Code', 'Offer', 'Phone Service', 
                    'Multiple Lines', 'Internet Service', 'Internet Type', 'Online Security', 
                    'Online Backup', 'Device Protection Plan', 'Premium Tech Support', 
                    'Streaming TV', 'Streaming Movies', 'Streaming Music', 'Unlimited Data', 
                    'Contract', 'Paperless Billing', 'Payment Method']

# Define preprocessing steps
numerical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler())
])

categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_cols),
        ('cat', categorical_transformer, categorical_cols)
    ])

# Preprocess the data
X = data.drop(['Customer Status', 'Churn Category'], axis=1)
y = data['Customer Status']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Define the model
model = RandomForestClassifier(random_state=42)

# Create and evaluate the pipeline
clf = Pipeline(steps=[('preprocessor', preprocessor),
                      ('classifier', model)])

# Fit the model
clf.fit(X_train, y_train)

# Predict
y_pred = clf.predict(X_test)

# Evaluate
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)


Accuracy: 0.8317955997161107
