<a href="https://colab.research.google.com/github/MohammedShabry/DL-Assignment/blob/malith/Random%20Forest%20Classifier.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [4]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Load the dataset
df = pd.read_csv("/content/drive/MyDrive/DL assignment/WA_Fn-UseC_-Telco-Customer-Churn.csv")

# Replace empty strings with NaN in 'TotalCharges' and convert to numeric
df['TotalCharges'] = pd.to_numeric(df['TotalCharges'], errors='coerce')

# Drop missing values
df.dropna(inplace=True)

# Convert 'Churn' to binary
df['Churn'] = df['Churn'].apply(lambda x: 1 if x == 'Yes' else 0)

# Convert categorical variables using OneHotEncoding
cat_features = ['gender', 'Partner', 'Dependents', 'PhoneService', 'MultipleLines', 'InternetService',
                'OnlineSecurity', 'OnlineBackup', 'DeviceProtection', 'TechSupport', 'StreamingTV',
                'StreamingMovies', 'Contract', 'PaperlessBilling', 'PaymentMethod']

df = pd.get_dummies(df, columns=cat_features, drop_first=True)

# Feature scaling for numerical columns
scaler = StandardScaler()
num_cols = ['tenure', 'MonthlyCharges', 'TotalCharges']

# Apply scaling to the numerical columns
df[num_cols] = scaler.fit_transform(df[num_cols])

# Split the dataset into training and testing
X = df.drop('Churn', axis=1)
y = df['Churn']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Check the first few rows of the transformed data
df.head()


Unnamed: 0,customerID,SeniorCitizen,tenure,MonthlyCharges,TotalCharges,Churn,gender_Male,Partner_Yes,Dependents_Yes,PhoneService_Yes,...,StreamingTV_No internet service,StreamingTV_Yes,StreamingMovies_No internet service,StreamingMovies_Yes,Contract_One year,Contract_Two year,PaperlessBilling_Yes,PaymentMethod_Credit card (automatic),PaymentMethod_Electronic check,PaymentMethod_Mailed check
0,7590-VHVEG,0,-1.280248,-1.161694,-0.994194,0,False,True,False,False,...,False,False,False,False,False,False,True,False,True,False
1,5575-GNVDE,0,0.064303,-0.260878,-0.17374,0,True,False,False,True,...,False,False,False,False,True,False,False,False,False,True
2,3668-QPYBK,0,-1.239504,-0.363923,-0.959649,1,True,False,False,True,...,False,False,False,False,False,False,True,False,False,True
3,7795-CFOCW,0,0.512486,-0.74785,-0.195248,0,True,False,False,False,...,False,False,False,False,True,False,False,False,False,False
4,9237-HQITU,0,-1.239504,0.196178,-0.940457,1,False,False,False,True,...,False,False,False,False,False,False,True,False,True,False


In [7]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix

# Load the dataset
df = pd.read_csv("/content/drive/MyDrive/DL assignment/WA_Fn-UseC_-Telco-Customer-Churn.csv")

# Replace empty strings with NaN in 'TotalCharges' and convert to numeric
df['TotalCharges'] = pd.to_numeric(df['TotalCharges'], errors='coerce')

# Drop missing values
df.dropna(inplace=True)

# Drop the 'customerID' column as it is non-numeric and irrelevant
df.drop(columns=['customerID'], inplace=True)

# Convert 'Churn' to binary
df['Churn'] = df['Churn'].apply(lambda x: 1 if x == 'Yes' else 0)

# Categorical features to encode
cat_features = ['gender', 'Partner', 'Dependents', 'PhoneService', 'MultipleLines', 'InternetService',
                'OnlineSecurity', 'OnlineBackup', 'DeviceProtection', 'TechSupport', 'StreamingTV',
                'StreamingMovies', 'Contract', 'PaperlessBilling', 'PaymentMethod']

# Convert categorical variables using OneHotEncoding
df = pd.get_dummies(df, columns=cat_features, drop_first=True)

# Feature scaling for numerical columns
scaler = StandardScaler()
num_cols = ['tenure', 'MonthlyCharges', 'TotalCharges']

# Apply scaling to the numerical columns
df[num_cols] = scaler.fit_transform(df[num_cols])

# Split the dataset into training and testing sets
X = df.drop('Churn', axis=1)
y = df['Churn']

# Verify that no non-numeric columns exist
non_numeric_columns = X.select_dtypes(include=['object']).columns
if len(non_numeric_columns) > 0:
    print(f"Non-numeric columns found: {non_numeric_columns}")
else:
    # Proceed with train-test split
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # Train Logistic Regression
    lr_model = LogisticRegression(max_iter=1000)
    lr_model.fit(X_train, y_train)

    # Make predictions
    y_pred_lr = lr_model.predict(X_test)

    # Evaluate the model
    print(f"Accuracy: {accuracy_score(y_test, y_pred_lr)}")
    print(f"Precision: {precision_score(y_test, y_pred_lr)}")
    print(f"Recall: {recall_score(y_test, y_pred_lr)}")
    print(f"F1-Score: {f1_score(y_test, y_pred_lr)}")

    # Confusion Matrix
    print(confusion_matrix(y_test, y_pred_lr))


Accuracy: 0.7867803837953091
Precision: 0.6193548387096774
Recall: 0.5133689839572193
F1-Score: 0.5614035087719298
[[915 118]
 [182 192]]


In [8]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix

# Train Random Forest Classifier
rf_model = RandomForestClassifier(random_state=42)
rf_model.fit(X_train, y_train)

# Make predictions
y_pred_rf = rf_model.predict(X_test)

# Evaluate the model
print(f"Accuracy: {accuracy_score(y_test, y_pred_rf)}")
print(f"Precision: {precision_score(y_test, y_pred_rf)}")
print(f"Recall: {recall_score(y_test, y_pred_rf)}")
print(f"F1-Score: {f1_score(y_test, y_pred_rf)}")

# Confusion Matrix
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred_rf))


Accuracy: 0.7846481876332623
Precision: 0.624561403508772
Recall: 0.47593582887700536
F1-Score: 0.5402124430955993
Confusion Matrix:
[[926 107]
 [196 178]]
