In [79]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix, classification_report,accuracy_score
from sklearn.preprocessing import StandardScaler, LabelEncoder
from scipy.stats import zscore

# Read the customer table from its CSV export.
data = pd.read_csv("F:/Downloads/Telegram Desktop/Customer.csv")

# Preview the first rows and the dtype pandas inferred for each column.
print("First 5 rows of the dataset:")
print(data.head(5))
print(data.dtypes)

# Report the number of missing values per column ...
print("\nChecking for missing values:")
print(data.isna().sum())

# ... then discard every row that contains at least one missing value.
data = data.dropna(axis=0, how="any")


First 5 rows of the dataset:
  Customer ID    Customer Name    Segment  Age        Country  \
0    CG-12520      Claire Gute   Consumer   67  United States   
1    DV-13045  Darrin Van Huff  Corporate   31  United States   
2    SO-20335   Sean O'Donnell   Consumer   65  United States   
3    BH-11710  Brosina Hoffman   Consumer   20  United States   
4    AA-10480     Andrew Allen   Consumer   50  United States   

              City           State  Postal Code Region  
0        Henderson        Kentucky        42420  South  
1      Los Angeles      California        90036   West  
2  Fort Lauderdale         Florida        33311  South  
3      Los Angeles      California        90032   West  
4          Concord  North Carolina        28027  South  
Customer ID      object
Customer Name    object
Segment          object
Age               int64
Country          object
City             object
State            object
Postal Code       int64
Region           object
dtype: object

Checkin

In [80]:
# Label-encode every object-dtype column of `data`.
# One LabelEncoder is fitted per column and kept in `label_encoders`
# (keyed by column name) so the integer codes can be mapped back to the
# original labels later via `inverse_transform`.
label_encoders = {
    name: LabelEncoder().fit(data[name])
    for name in data.select_dtypes(include=['object']).columns
}

# Overwrite each text column with its integer codes.
for name, encoder in label_encoders.items():
    data[name] = encoder.transform(data[name])

print(label_encoders)
data.head()


{'Customer ID': LabelEncoder(), 'Customer Name': LabelEncoder(), 'Segment': LabelEncoder(), 'Country': LabelEncoder(), 'City': LabelEncoder(), 'State': LabelEncoder(), 'Region': LabelEncoder()}


Unnamed: 0,Customer ID,Customer Name,Segment,Age,Country,City,State,Postal Code,Region
0,143,166,0,67,0,90,14,42420,2
1,237,201,1,31,0,129,3,90036,3
2,705,687,0,65,0,71,8,33311,2
3,88,113,0,20,0,129,3,90032,3
4,2,31,0,50,0,42,28,28027,2


In [81]:
def remove_outliers_iqr(df, column, multiplier=1.5):
    """Return the rows of ``df`` whose ``column`` value lies inside the IQR fences.

    The fences are ``Q1 - multiplier * IQR`` and ``Q3 + multiplier * IQR``,
    where ``Q1``/``Q3`` are the 25th/75th percentiles of ``df[column]`` and
    ``IQR = Q3 - Q1``. Both fences are inclusive.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to filter. It is not modified; a filtered frame is returned.
    column : label
        Name of the numeric column whose values are tested.
    multiplier : float, default 1.5
        How many IQRs beyond the quartiles a value may lie before it is
        treated as an outlier. Must be non-negative.

    Returns
    -------
    pandas.DataFrame
        The rows of ``df`` (original index labels preserved) whose value in
        ``column`` is within the fences. Rows where the value is missing are
        dropped, because a missing value never compares as in-range.

    Raises
    ------
    ValueError
        If ``multiplier`` is negative.
    KeyError
        If ``column`` is not a column of ``df``.
    """
    if multiplier < 0:
        raise ValueError("multiplier must be non-negative")

    values = df[column]
    q1 = values.quantile(0.25)
    q3 = values.quantile(0.75)
    reach = multiplier * (q3 - q1)

    # `between` is inclusive on both ends by default, matching `>=` / `<=`.
    within_fences = values.between(q1 - reach, q3 + reach)
    return df[within_fences]

# Apply IQR outlier removal to the genuinely numeric feature columns.
#
# * `include='number'` selects every numeric dtype, not only int64, so float
#   or other-width integer columns are no longer skipped silently.
# * Columns recorded in `label_encoders` are skipped: their integers are
#   arbitrary category codes, so quartile-based fences on them carry no
#   meaning and could drop rows purely because of a label's sort position.
# * The target column is never filtered on.
#
# NOTE: this reassigns `data`, so re-running the cell filters the already
# filtered frame again.
numerical_columns = data.select_dtypes(include='number').columns
target_column = data.columns[-1]  # Replace this with the actual target column if incorrect
encoded_columns = set(label_encoders)
for col in numerical_columns:
    if col == target_column or col in encoded_columns:
        continue
    data = remove_outliers_iqr(data, col)

In [82]:
# Separate the label from the features; `target_column` is the name chosen
# in the outlier-removal cell.
y = data[target_column]
X = data.drop(columns=target_column)

# Standardise each feature column to zero mean and unit variance.
# Note that the scaler is fitted on every row of X here.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
print(X_scaled)

[[-1.10519345 -1.00472132 -0.87040189 ... -0.63731952 -0.41241998
  -0.40461225]
 [-0.69456822 -0.85182894  0.42702047 ... -0.06505866 -1.23134543
   1.09940349]
 [ 1.34982125  1.27119088 -0.87040189 ... -0.91611328 -0.85910659
  -0.69233233]
 ...
 [ 1.24498076  1.41971491 -0.87040189 ... -1.86988138 -0.71021105
   0.15080217]
 [ 1.05277321  1.00035296 -0.87040189 ... -0.40254584  0.70429654
  -0.34712509]
 [ 1.33234783  1.43718832 -0.87040189 ...  0.41916207  0.70429654
  -0.38455491]]


In [83]:
# Split the *unscaled* features first, then fit a scaler on the training rows
# only. Fitting the scaler before splitting would let the test rows influence
# the mean/variance used for preprocessing (data leakage), which makes the
# held-out score an optimistic estimate.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)
split_scaler = StandardScaler()
X_train = split_scaler.fit_transform(X_train_raw)  # learn statistics from train only
X_test = split_scaler.transform(X_test_raw)        # reuse the train statistics

# Train the classifier; the fixed seed keeps the forest reproducible.
clf = RandomForestClassifier(random_state=42)
clf.fit(X_train, y_train)

# Make predictions on the held-out rows
y_pred = clf.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy of the RandomForestmodel:", accuracy)

# Confusion matrix (rows: true class, columns: predicted class) and
# per-class precision / recall / F1
conf_matrix = confusion_matrix(y_test, y_pred)
class_report = classification_report(y_test, y_pred)
print("\nConfusion Matrix:")
print(conf_matrix)
print("\nClassification Report:")
print(class_report)

Accuracy of the RandomForestmodel: 0.9748427672955975

Confusion Matrix:
[[36  0  0  0]
 [ 0 31  0  0]
 [ 4  0 37  0]
 [ 0  0  0 51]]

Classification Report:
              precision    recall  f1-score   support

           0       0.90      1.00      0.95        36
           1       1.00      1.00      1.00        31
           2       1.00      0.90      0.95        41
           3       1.00      1.00      1.00        51

    accuracy                           0.97       159
   macro avg       0.97      0.98      0.97       159
weighted avg       0.98      0.97      0.97       159

