# Preprocessing dataset

In [2]:
# Mount Google Drive into the Colab filesystem at /content/drive so that
# files stored on Drive can be opened by path in later cells.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [19]:
import pandas as pd

In [20]:
# Location of the cleaned CSV on the mounted Drive -- replace with the actual file path.
file_path = "/content/drive/MyDrive/MLCW/cleaned_dataset.csv"
# Read the whole CSV into a DataFrame.
data = pd.read_csv(filepath_or_buffer=file_path)

In [21]:
# DataFrame.info() writes its summary to stdout itself and returns None,
# so it is called directly; wrapping it in print() appends a stray "None".
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 41174 entries, 0 to 41173
Data columns (total 19 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   age             41174 non-null  int64  
 1   job             41174 non-null  object 
 2   marital         41174 non-null  object 
 3   education       41174 non-null  object 
 4   housing         41174 non-null  object 
 5   loan            41174 non-null  object 
 6   contact         41174 non-null  object 
 7   month           41174 non-null  object 
 8   day_of_week     41174 non-null  object 
 9   campaign        41174 non-null  int64  
 10  pdays           41174 non-null  int64  
 11  previous        41174 non-null  int64  
 12  poutcome        41174 non-null  object 
 13  emp.var.rate    41174 non-null  float64
 14  cons.price.idx  41174 non-null  float64
 15  cons.conf.idx   41174 non-null  float64
 16  euribor3m       41174 non-null  float64
 17  nr.employed     41174 non-null 

In [22]:
# Collect the labels of every float64 / int64 column in the frame.
numerical_columns = data.select_dtypes(include=('float64', 'int64')).columns
print(f"Numerical Columns: {numerical_columns}")

Numerical Columns: Index(['age', 'campaign', 'pdays', 'previous', 'emp.var.rate',
       'cons.price.idx', 'cons.conf.idx', 'euribor3m', 'nr.employed'],
      dtype='object')


In [32]:
# Summary statistics (count, mean, std, quartiles, min, max) for the numerical columns.
# NOTE(review): this cell's execution count (32) is higher than that of the
# MinMaxScaler cell that follows (24), and the saved output shows minima of 0
# and maxima of 1, so it appears to have been run after scaling. A fresh
# top-to-bottom run would presumably summarize the unscaled values here --
# confirm, and move this cell below the scaling step if the scaled summary is intended.
print(data[numerical_columns].describe())

                age      campaign         pdays      previous  emp.var.rate  \
count  41174.000000  41174.000000  41174.000000  41174.000000  41174.000000   
mean       0.284242      0.028508      0.963426      0.024717      0.725388   
std        0.128649      0.050370      0.187129      0.070711      0.327271   
min        0.000000      0.000000      0.000000      0.000000      0.000000   
25%        0.185185      0.000000      1.000000      0.000000      0.333333   
50%        0.259259      0.018182      1.000000      0.000000      0.937500   
75%        0.370370      0.036364      1.000000      0.000000      1.000000   
max        1.000000      1.000000      1.000000      1.000000      1.000000   

       cons.price.idx  cons.conf.idx     euribor3m   nr.employed  
count    41174.000000   41174.000000  41174.000000  41174.000000  
mean         0.535742       0.430834      0.677223      0.769122  
std          0.225584       0.193635      0.393211      0.273166  
min          0.00000

In [24]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler

# Fit the scaler on training rows only. Fitting on the full frame would let
# the min/max of rows that later land in the test set influence the
# transformation (data leakage).
# The settings below (test_size=0.2, random_state=42, stratify on 'y') mirror
# the train/test split performed later in this notebook so that the rows used
# for fitting correspond to the later training set; keep the two in sync.
train_targets = train_test_split(
    data['y'], test_size=0.2, random_state=42, stratify=data['y']
)[0]

# Initialize MinMaxScaler and learn per-column min/max from the training rows
scaler = MinMaxScaler()
scaler.fit(data.loc[train_targets.index, numerical_columns])

# Apply the learned scaling to every row; values in held-out rows may fall
# slightly outside [0, 1] if they exceed the training range.
data[numerical_columns] = scaler.transform(data[numerical_columns])

# Verify normalized data
print(data[numerical_columns].head())


        age  campaign  pdays  previous  emp.var.rate  cons.price.idx  \
0  0.481481       0.0    1.0       0.0        0.9375        0.698753   
1  0.493827       0.0    1.0       0.0        0.9375        0.698753   
2  0.246914       0.0    1.0       0.0        0.9375        0.698753   
3  0.283951       0.0    1.0       0.0        0.9375        0.698753   
4  0.481481       0.0    1.0       0.0        0.9375        0.698753   

   cons.conf.idx  euribor3m  nr.employed  
0        0.60251   0.957379     0.859735  
1        0.60251   0.957379     0.859735  
2        0.60251   0.957379     0.859735  
3        0.60251   0.957379     0.859735  
4        0.60251   0.957379     0.859735  


In [25]:
# Collect the labels of every object-dtype (string-valued) column.
categorical_columns = data.select_dtypes(include='object').columns
print(f"Categorical Columns: {categorical_columns}")

Categorical Columns: Index(['job', 'marital', 'education', 'housing', 'loan', 'contact', 'month',
       'day_of_week', 'poutcome', 'y'],
      dtype='object')


In [26]:
# Show the frequency of each distinct value, one categorical column at a time.
for col in categorical_columns:
    print(f"\nUnique values in '{col}':", data[col].value_counts(), sep="\n")


Unique values in 'job':
job
admin.           10418
blue-collar       9252
technician        6739
services          3967
management        2924
retired           1718
entrepreneur      1456
self-employed     1421
housemaid         1060
unemployed        1014
student            875
other              330
Name: count, dtype: int64

Unique values in 'marital':
marital
married     24999
single      11564
divorced     4611
Name: count, dtype: int64

Unique values in 'education':
education
university.degree      12163
high.school             9512
basic.9y                6045
professional.course     5240
basic.4y                4175
basic.6y                2291
other                   1730
illiterate                18
Name: count, dtype: int64

Unique values in 'housing':
housing
yes    22560
no     18614
Name: count, dtype: int64

Unique values in 'loan':
loan
no     34926
yes     6248
Name: count, dtype: int64

Unique values in 'contact':
contact
cellular     26134
telephone    15040
Name: 

In [27]:
# Define the order for education levels
education_order = {
    "illiterate": 0,
    "basic.4y": 1,
    "basic.6y": 2,
    "basic.9y": 3,
    "high.school": 4,
    "professional.course": 5,
    "university.degree": 6,
    "other": 7
}

In [28]:
# Apply label encoding for 'education'
data['education'] = data['education'].map(education_order)
print(data['education'].head())

0    1
1    4
2    4
3    2
4    4
Name: education, dtype: int64


In [29]:
# Apply One-Hot Encoding to nominal columns
nominal_columns = ['job', 'marital', 'housing', 'loan', 'contact', 'month', 'day_of_week', 'poutcome']
data = pd.get_dummies(data, columns=nominal_columns, drop_first=True)

In [30]:
# Inspect the frame after encoding. DataFrame.info() prints its summary
# itself and returns None, so it is called without print() to avoid a
# trailing "None" line.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 41174 entries, 0 to 41173
Data columns (total 42 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   age                   41174 non-null  float64
 1   education             41174 non-null  int64  
 2   campaign              41174 non-null  float64
 3   pdays                 41174 non-null  float64
 4   previous              41174 non-null  float64
 5   emp.var.rate          41174 non-null  float64
 6   cons.price.idx        41174 non-null  float64
 7   cons.conf.idx         41174 non-null  float64
 8   euribor3m             41174 non-null  float64
 9   nr.employed           41174 non-null  float64
 10  y                     41174 non-null  object 
 11  job_blue-collar       41174 non-null  bool   
 12  job_entrepreneur      41174 non-null  bool   
 13  job_housemaid         41174 non-null  bool   
 14  job_management        41174 non-null  bool   
 15  job_other          

In [33]:
# Preview the first rows of the encoded frame as plain text.
print(data.head())

        age  education  campaign  pdays  previous  emp.var.rate  \
0  0.481481          1       0.0    1.0       0.0        0.9375   
1  0.493827          4       0.0    1.0       0.0        0.9375   
2  0.246914          4       0.0    1.0       0.0        0.9375   
3  0.283951          2       0.0    1.0       0.0        0.9375   
4  0.481481          4       0.0    1.0       0.0        0.9375   

   cons.price.idx  cons.conf.idx  euribor3m  nr.employed  ... month_may  \
0        0.698753        0.60251   0.957379     0.859735  ...      True   
1        0.698753        0.60251   0.957379     0.859735  ...      True   
2        0.698753        0.60251   0.957379     0.859735  ...      True   
3        0.698753        0.60251   0.957379     0.859735  ...      True   
4        0.698753        0.60251   0.957379     0.859735  ...      True   

   month_nov  month_oct  month_sep  day_of_week_mon  day_of_week_thu  \
0      False      False      False             True            False   
1 

# Split Dataset

In [34]:
# Separate features and target
X = data.drop('y', axis=1)  # Drop the target column from features
y = data['y']              # Target variable

In [35]:
from sklearn.model_selection import train_test_split

In [36]:
# Split into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y  # 20% test, stratify ensures class balance
)

# Display the sizes of the datasets
print("Training set size (X_train):", X_train.shape)
print("Test set size (X_test):", X_test.shape)
print("Training set size (y_train):", y_train.shape)
print("Test set size (y_test):", y_test.shape)

Training set size (X_train): (32939, 41)
Test set size (X_test): (8235, 41)
Training set size (y_train): (32939,)
Test set size (y_test): (8235,)


In [37]:
# Check distribution in training set
print("Training set target distribution:")
print(y_train.value_counts(normalize=True))

# Check distribution in test set
print("Test set target distribution:")
print(y_test.value_counts(normalize=True))


Training set target distribution:
y
no     0.887337
yes    0.112663
Name: proportion, dtype: float64
Test set target distribution:
y
no     0.88731
yes    0.11269
Name: proportion, dtype: float64


In [43]:
# Show the (truncated) training and test target Series, one after the other.
print(y_train, y_test, sep="\n")

1572      no
28767     no
25931    yes
27015     no
11758     no
        ... 
22252     no
37081    yes
8293      no
32680     no
33049     no
Name: y, Length: 32939, dtype: object
12462    no
11184    no
29694    no
28919    no
6872     no
         ..
35300    no
29537    no
1802     no
84       no
11146    no
Name: y, Length: 8235, dtype: object


In [44]:
# Rich-display preview of the first rows of the fully preprocessed frame
# (scaled numerics, coded education, one-hot indicator columns).
data.head()

Unnamed: 0,age,education,campaign,pdays,previous,emp.var.rate,cons.price.idx,cons.conf.idx,euribor3m,nr.employed,...,month_may,month_nov,month_oct,month_sep,day_of_week_mon,day_of_week_thu,day_of_week_tue,day_of_week_wed,poutcome_nonexistent,poutcome_success
0,0.481481,1,0.0,1.0,0.0,0.9375,0.698753,0.60251,0.957379,0.859735,...,True,False,False,False,True,False,False,False,True,False
1,0.493827,4,0.0,1.0,0.0,0.9375,0.698753,0.60251,0.957379,0.859735,...,True,False,False,False,True,False,False,False,True,False
2,0.246914,4,0.0,1.0,0.0,0.9375,0.698753,0.60251,0.957379,0.859735,...,True,False,False,False,True,False,False,False,True,False
3,0.283951,2,0.0,1.0,0.0,0.9375,0.698753,0.60251,0.957379,0.859735,...,True,False,False,False,True,False,False,False,True,False
4,0.481481,4,0.0,1.0,0.0,0.9375,0.698753,0.60251,0.957379,0.859735,...,True,False,False,False,True,False,False,False,True,False


In [45]:
# Convert target variable to numeric
y_train = y_train.map({'yes': 1, 'no': 0})
y_test = y_test.map({'yes': 1, 'no': 0})

# Verify the conversion
print(y_train.value_counts())
print(y_test.value_counts())


y
0    29228
1     3711
Name: count, dtype: int64
y
0    7307
1     928
Name: count, dtype: int64


In [46]:
# Check for missing values
print("Missing values in X_train:", X_train.isnull().sum().sum())
print("Missing values in X_test:", X_test.isnull().sum().sum())

# Check the shapes
print("X_train shape:", X_train.shape)
print("X_test shape:", X_test.shape)

Missing values in X_train: 0
Missing values in X_test: 0
X_train shape: (32939, 41)
X_test shape: (8235, 41)


In [47]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix


In [48]:
# Initialize Random Forest model with library-default hyperparameters;
# random_state=42 fixes the seed so repeated runs are comparable.
# NOTE(review): the saved target distribution above shows roughly 11% 'yes'
# labels; whether class weighting or resampling is wanted is a modelling
# decision to confirm.
rf_model = RandomForestClassifier(random_state=42)


In [49]:
# Train the Random Forest model on the training features and the
# numerically encoded training labels.
rf_model.fit(X_train, y_train)


In [54]:
# Predict a label for every row of the held-out test features
y_pred = rf_model.predict(X_test)

# Report overall accuracy, per-class precision/recall/F1, and the confusion matrix
print(f"Accuracy: {accuracy_score(y_test, y_pred)}")
print(f"\nClassification Report:\n {classification_report(y_test, y_pred)}")
print(f"\nConfusion Matrix:\n {confusion_matrix(y_test, y_pred)}")

Accuracy: 0.8933819064966606

Classification Report:
               precision    recall  f1-score   support

           0       0.92      0.97      0.94      7307
           1       0.55      0.30      0.39       928

    accuracy                           0.89      8235
   macro avg       0.73      0.63      0.66      8235
weighted avg       0.87      0.89      0.88      8235


Confusion Matrix:
 [[7082  225]
 [ 653  275]]
