In [3]:
import numpy as np
import pandas as pd

In [4]:
# Read the training data from its CSV file into a DataFrame.
training_csv_path = "/content/training_dataset.csv"
dataset = pd.read_csv(training_csv_path)

# Data Cleaning

In [5]:
# Discard, in place, every training row that has no value for the mail
# category or the mail type.
attribute_drop = ['mail_category', 'mail_type']
dataset.dropna(axis='index', how='any', subset=attribute_drop, inplace=True)

In [6]:
# Replace missing values in the three event-time columns with 0, handling all
# of the listed columns in a single assignment.
attribute_fill_zero = ['unsubscribe_time', 'open_time', 'click_time']
dataset[attribute_fill_zero] = dataset[attribute_fill_zero].fillna(value=0)

In [7]:
# Impute missing values in the listed columns with the column median.
attribute_fill_median = ['last_online', 'hacker_timezone']
for a in attribute_fill_median:
    # The median is truncated to a whole number before it is used as the
    # fill value.
    median_value = int(dataset[a].median())
    # Assign the filled column back to the frame: calling fillna(inplace=True)
    # on a column selection acts on a temporary object, which is deprecated
    # in pandas 2.x and leaves the frame unchanged under Copy-on-Write.
    dataset[a] = dataset[a].fillna(median_value)

In [8]:
# These attributes are not present in the testing data, so they are removed
# from the training frame (in place) as well.
columns_absent_from_test = [
    'user_id',
    'mail_id',
    'open_time',
    'click_time',
    'unsubscribe_time',
    'clicked',
    'unsubscribed',
]
dataset.drop(columns=columns_absent_from_test, inplace=True)

# Data Preprocessing

In [9]:
from sklearn.preprocessing import LabelEncoder

In [10]:
# One independent, unfitted LabelEncoder per categorical column.
(
    le_mail_type,
    le_mail_category,
    le_click,
    le_opened,
    le_unsubscribed,
    le_hacker_confirmation,
) = (LabelEncoder() for _ in range(6))

In [11]:
# Fit each encoder on its training column and overwrite the column with the
# resulting integer codes. "clicked" and "unsubscribed" are not encoded here:
# both columns were dropped from the frame during data cleaning.
for column_name, column_encoder in (
    ('mail_category', le_mail_category),
    ('mail_type', le_mail_type),
    ("opened", le_opened),
    ('hacker_confirmation', le_hacker_confirmation),
):
    dataset[column_name] = column_encoder.fit_transform(dataset[column_name])

# **Feature Scaling**

In [12]:
from sklearn.preprocessing import MinMaxScaler

In [13]:
# One independent, unfitted MinMaxScaler per numeric column.
(
    scaler_send_time,
    scaler_last_online,
    scaler_hacker_created,
    scaler_timezone,
    scaler_ipcount,
    scaler_ipcount_365,
) = (MinMaxScaler() for _ in range(6))

In [14]:
# Fit the scaler on the training 'sent_time' values and replace the column
# with its min-max scaled version in one step.
dataset['sent_time'] = scaler_send_time.fit_transform(dataset[['sent_time']])

In [15]:
# Fit the scaler on the training 'last_online' values and replace the column
# with its min-max scaled version in one step.
dataset['last_online'] = scaler_last_online.fit_transform(dataset[['last_online']])

In [16]:
# Fit the scaler on the training 'hacker_created_at' values and replace the
# column with its min-max scaled version in one step.
dataset['hacker_created_at'] = scaler_hacker_created.fit_transform(
    dataset[['hacker_created_at']]
)

In [17]:
# Fit the scaler on the training 'hacker_timezone' values and replace the
# column with its min-max scaled version in one step.
dataset['hacker_timezone'] = scaler_timezone.fit_transform(
    dataset[['hacker_timezone']]
)

In [18]:
# Fit the scaler on the training 'ipn_count' values and replace the column
# with its min-max scaled version in one step.
dataset['ipn_count'] = scaler_ipcount.fit_transform(dataset[['ipn_count']])

In [19]:
# Fit the scaler on the training 'ipn_count_365_days' values and replace the
# column with its min-max scaled version in one step.
dataset['ipn_count_365_days'] = scaler_ipcount_365.fit_transform(
    dataset[['ipn_count_365_days']]
)

In [20]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

In [21]:
# Separate the "opened" target column (Y) from the remaining feature
# columns (X).
Y = dataset.loc[:, "opened"]
X = dataset.drop(columns="opened")

In [22]:
# Train a random forest on all of the prepared training features.
# A fixed random_state makes the fitted forest, and therefore the predictions
# derived from it, reproducible when the notebook is rerun.
model = RandomForestClassifier(random_state=42)
model.fit(X, Y)

# Test Data Task

In [23]:
# Read the test data from its CSV file into a DataFrame.
test_csv_path = "/content/test_dataset.csv"
test_dataset = pd.read_csv(test_csv_path)

In [24]:
# Remove the two identifier columns from the test frame (in place).
test_identifier_columns = ['user_id', 'mail_id']
test_dataset.drop(columns=test_identifier_columns, inplace=True)

In [25]:
# Impute missing values in the listed test columns with the column median.
attribute_fill_median_test = ["last_online", "hacker_timezone"]
for a in attribute_fill_median_test:
    # Assign the filled column back to the frame: calling fillna(inplace=True)
    # on a column selection acts on a temporary object, which is deprecated
    # in pandas 2.x and leaves the frame unchanged under Copy-on-Write.
    test_dataset[a] = test_dataset[a].fillna(test_dataset[a].median())

In [26]:
# Impute missing values in the categorical test columns with the first
# (most frequent) value reported by mode().
attribute_fill_mode_test = ["mail_category", "mail_type"]
for column_name in attribute_fill_mode_test:
    most_frequent_value = test_dataset[column_name].mode().iloc[0]
    test_dataset[column_name] = test_dataset[column_name].fillna(most_frequent_value)

# Test Data Preprocessing

In [27]:
# Encode the categorical test columns with the encoders that were fitted on
# the training data. Fitting fresh encoders on the test set could assign
# different integer codes to the same label whenever the two sets do not
# contain exactly the same labels, and the model would then read the codes
# with the wrong meaning.
# The *_test names are kept as aliases of the training encoders so that any
# code referring to them keeps working.
le_hacker_confirmation_test = le_hacker_confirmation
le_mail_category_test = le_mail_category
le_mail_type_test = le_mail_type
# NOTE: transform() raises ValueError if a test column contains a label that
# was not seen while fitting on the training data.
test_dataset['hacker_confirmation'] = le_hacker_confirmation_test.transform(test_dataset['hacker_confirmation'])
test_dataset['mail_category'] = le_mail_category_test.transform(test_dataset['mail_category'])
test_dataset['mail_type'] = le_mail_type_test.transform(test_dataset['mail_type'])

# Test Data Feature Scaling

In [28]:
# One independent, unfitted MinMaxScaler per numeric test column.
(
    scaler_send_time_test,
    scaler_last_online_test,
    scaler_hacker_created_test,
    scaler_timezone_test,
    scaler_ipcount_test,
    scaler_ipcount_365_test,
) = (MinMaxScaler() for _ in range(6))

In [29]:
# Scale every numeric test column with the scaler that was fitted on the
# matching training column, so test values go through exactly the same
# transformation as the data the model was trained on.
# Previously 'sent_time' was rescaled six times with a scaler fitted on the
# test data, and the other five columns were never scaled at all.
for column_name, fitted_scaler in (
    ('sent_time', scaler_send_time),
    ('last_online', scaler_last_online),
    ('hacker_created_at', scaler_hacker_created),
    ('hacker_timezone', scaler_timezone),
    ('ipn_count', scaler_ipcount),
    ('ipn_count_365_days', scaler_ipcount_365),
):
    # transform() only: the scaler must not be re-fitted on test data.
    test_dataset[column_name] = fitted_scaler.transform(test_dataset[[column_name]])

# Predicting Value for Test data

In [30]:
# Predict the whole test set with a single vectorised call instead of one
# predict() call per row.
# Selecting X.columns hands the model a DataFrame whose columns have the same
# names and order as the features it was fitted on; passing bare row values
# discards the feature names and depends on the test columns already being in
# training order.
test_features = test_dataset[X.columns]
predictions = model.predict(test_features)
# Keep `result` as a list holding one single-element array per test row, so
# code that iterates over it and treats each item as a row keeps working.
result = list(predictions.reshape(-1, 1))

Streaming output truncated to the last 5000 lines.


# Converting Array Values into CSV file

In [33]:
import csv
# Destination of the exported predictions.
csv_file_path = "output.csv"

# Write every entry of `result` as one CSV row. newline='' leaves line-ending
# handling to the csv module.
with open(csv_file_path, mode='w', newline='') as csv_file:
    csv_writer = csv.writer(csv_file)
    csv_writer.writerows(result)