In [1]:
# Import necessary libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

# Step 1: Data Loading and Exploration

In [2]:

train_data = pd.read_csv("/kaggle/input/spam-emails12345/train.csv")
test_data = pd.read_csv("/kaggle/input/spam-emails12345/test.csv")

In [3]:
# Display basic information about the datasets
print("Training Data Shape:", train_data.shape)
print("Test Data Shape:", test_data.shape)
print("\nTraining Data Columns:", train_data.columns)

Training Data Shape: (15000, 14)
Test Data Shape: (10000, 13)

Training Data Columns: Index(['id', 'CustomerId', 'Surname', 'CreditScore', 'Geography', 'Gender',
       'Age', 'Tenure', 'Balance', 'NumOfProducts', 'HasCrCard',
       'IsActiveMember', 'EstimatedSalary', 'Exited'],
      dtype='object')


In [4]:
# Display first few rows of the training dataset
print(' \nFirst few rows of the training dataset: ')
train_data.head()

 
First few rows of the training dataset: 


Unnamed: 0,id,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,0,15715707.0,P'eng,685.0,France,Female,31.0,4.0,152582.2,1.0,1.0,1.0,130795.52,0.0
1,1,15651022.0,Ch'ien,590.0,France,Male,43.0,2.0,0.0,2.0,1.0,0.0,96929.24,0.0
2,2,15589969.0,Yeh,588.0,France,Male,31.0,10.0,0.0,2.0,1.0,1.0,177896.92,0.0
3,3,15618661.0,Ch'ien,593.0,Spain,Male,37.0,9.0,0.0,2.0,1.0,0.0,53817.23,0.0
4,4,15771580.0,Nweke,730.0,France,Female,52.0,8.0,0.0,1.0,1.0,0.0,1276.87,1.0


# Step 2: Data Preprocessing

In [5]:
# Drop the 'Surname' column
train_data = train_data.drop(columns=['Surname'])
test_data = test_data.drop(columns=['Surname'])


In [6]:
# Perform one-hot encoding for categorical variables 'Geography' and 'Gender'
train_data = pd.get_dummies(train_data, columns=['Geography', 'Gender'], drop_first=True)
test_data = pd.get_dummies(test_data, columns=['Geography', 'Gender'], drop_first=True)


In [7]:
# Separate features and target variable
X = train_data.drop(columns=['Exited'])
y = train_data['Exited']

In [8]:
# Split the data into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# Step 4: Model Selection and Training

In [9]:
# Define a pipeline for preprocessing and model training
pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('classifier', RandomForestClassifier(random_state=42))
])

In [10]:
# Train the model
pipeline.fit(X_train, y_train)

# Step 5: Model Evaluation

In [11]:
# Make predictions on the validation set
val_predictions = pipeline.predict(X_val)

In [12]:
# Evaluate model performance
print("\nModel Evaluation:")
print("Accuracy:", accuracy_score(y_val, val_predictions))
print("\nConfusion Matrix:")
print(confusion_matrix(y_val, val_predictions))
print("\nClassification Report:")
print(classification_report(y_val, val_predictions))


Model Evaluation:
Accuracy: 0.8933333333333333

Confusion Matrix:
[[2295   87]
 [ 233  385]]

Classification Report:
              precision    recall  f1-score   support

         0.0       0.91      0.96      0.93      2382
         1.0       0.82      0.62      0.71       618

    accuracy                           0.89      3000
   macro avg       0.86      0.79      0.82      3000
weighted avg       0.89      0.89      0.89      3000



# Step 6: Predictions

In [13]:
# Make predictions on the test dataset
test_predictions = pipeline.predict_proba(test_data)[:, 1]

# Step 7: Submission

In [14]:
# Prepare submission file
submission_df = pd.DataFrame({'id': test_data['id'], 'spam': test_predictions})
submission_df.to_csv('submission2.csv', index=False)