In [13]:
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.neural_network import MLPClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder, StandardScaler  # features are standardised for the scale-sensitive models
from sklearn.tree import DecisionTreeClassifier

In [14]:
# Load 'balanced_trainingdata.csv' and display its first rows.
df = pd.read_csv(filepath_or_buffer='balanced_trainingdata.csv')
df.head()

Unnamed: 0,category,main_promotion,color,stars,success_indicator
0,5,0,3,1.0,0
1,1,1,7,0.0,0
2,3,0,7,1.0,1
3,2,1,1,1.0,0
4,1,1,3,1.0,1


# Pipeline

In [16]:
class ANNClassifier:
    """Train, tune and apply a scaled classifier pipeline on tabular product data.

    Training workflow: ``load`` -> ``preprocess`` -> ``fit`` -> ``evaluate``
    and/or ``tune``. Scoring new data: ``load_test_file`` ->
    ``test_data_preprocessor`` -> ``predict_for_test_data``.

    The training file must contain the columns ``item_no``, ``category``,
    ``main_promotion``, ``color``, ``stars`` and ``success_indicator``
    (with the labels ``'flop'`` and ``'top'``); files to score need the same
    columns except ``success_indicator``.

    Attributes set along the way:
        data: raw training frame read by ``load``.
        label_encoder1 / label_encoder2 / label_encoder3: encoders fitted on
            ``category`` / ``main_promotion`` / ``color`` by ``preprocess``.
        scaler: ``StandardScaler`` fitted on the training split only.
        x_train, x_test, y_encoded_train, y_encoded_test: the 80/20 split.
        pipeline: fitted scaler + MLP pipeline produced by ``fit``.
        best_model, best_score: winner of the last ``tune`` run.
        input_data, input_data_processed: raw / processed frame to score.
    """

    # Categorical column -> name of the attribute holding its fitted encoder.
    _CATEGORICAL_ENCODERS = (
        ('category', 'label_encoder1'),
        ('main_promotion', 'label_encoder2'),
        ('color', 'label_encoder3'),
    )
    # Explicit target coding, so the positive class is always 'top'.
    _TARGET_CODES = {'flop': 0, 'top': 1}
    # Ratings above this value are flagged 1, ratings at or below it 0.
    _STAR_THRESHOLD = 3

    def __init__(self):
        self.pipeline = None
        self.label_encoder1 = None
        self.label_encoder2 = None
        self.label_encoder3 = None
        self.scaler = None
        self.x_train = None
        self.x_test = None
        self.y_encoded_train = None
        self.y_encoded_test = None
        # Declared up front so every attribute exists before its producer runs.
        self.data = None
        self.input_data = None
        self.input_data_processed = None
        self.best_model = None
        self.best_score = None

    def load(self, file_path):
        """Read the training CSV at ``file_path`` into ``self.data``."""
        self.data = pd.read_csv(file_path)

    @classmethod
    def _binarize_stars(cls, stars):
        """Map ratings to 1 (above the threshold) or 0; missing values stay missing."""
        limit = cls._STAR_THRESHOLD
        return np.where(stars > limit, 1, np.where(stars <= limit, 0, stars))

    def _encode_features(self, features, fit):
        """Label-encode the categorical columns and binarise ``stars``.

        Args:
            features: frame holding the raw feature columns; it is not modified.
            fit: when True, fit fresh encoders and store them on the instance;
                when False, reuse the encoders fitted during ``preprocess``.

        Returns:
            A new frame in which each categorical column is replaced by a
            ``<column>_encoded`` column appended at the end.
        """
        encoded = features.copy()
        for column, attribute in self._CATEGORICAL_ENCODERS:
            if fit:
                encoder = LabelEncoder()
                setattr(self, attribute, encoder)
                codes = encoder.fit_transform(encoded[column])
            else:
                codes = getattr(self, attribute).transform(encoded[column])
            encoded[f'{column}_encoded'] = codes
            encoded = encoded.drop(columns=column)
        encoded['stars'] = self._binarize_stars(encoded['stars'])
        return encoded

    def preprocess(self):
        """Encode the loaded data, split it 80/20 and standardise the features.

        Raises:
            ValueError: if ``success_indicator`` holds a label other than
                ``'flop'`` or ``'top'``.
        """
        x = self._encode_features(
            self.data.drop(['success_indicator', 'item_no'], axis=1), fit=True)
        y = self.data['success_indicator']

        unexpected = set(y.unique()) - set(self._TARGET_CODES)
        if unexpected:
            raise ValueError(
                "Unexpected success_indicator labels: "
                + ", ".join(sorted(map(str, unexpected))))
        y_encoded = y.map(self._TARGET_CODES).to_numpy()

        x_train, x_test, self.y_encoded_train, self.y_encoded_test = train_test_split(
            x, y_encoded, test_size=0.2, random_state=77)

        # Fit the scaler on the training split only, so that no statistics of
        # the held-out rows leak into training.
        self.scaler = StandardScaler()
        self.x_train = self.scaler.fit_transform(x_train)
        self.x_test = self.scaler.transform(x_test)

    def create_model(self):
        """Return a new, unfitted ``MLPClassifier`` with ``max_iter=1000``."""
        model = MLPClassifier(max_iter=1000)
        return model

    def _build_pipeline(self):
        """Return an unfitted scaler + classifier pipeline.

        The step names 'scaler' and 'classifier' are relied upon by the
        parameter grids in ``tune``.
        """
        return Pipeline([('scaler', StandardScaler()), ('classifier', self.create_model())])

    def fit(self):
        """Build the pipeline and fit it on the training split."""
        self.pipeline = self._build_pipeline()
        self.pipeline.fit(self.x_train, self.y_encoded_train)

    def predict(self):
        """Return the fitted pipeline's predictions for the held-out split."""
        return self.pipeline.predict(self.x_test)

    @staticmethod
    def _compute_metrics(y_true, y_pred):
        """Return the four reported metrics keyed by their printed label."""
        return {
            "Accuracy": accuracy_score(y_true, y_pred),
            "Precision": precision_score(y_true, y_pred),
            "Recall": recall_score(y_true, y_pred),
            "F1 Score": f1_score(y_true, y_pred),
        }

    @staticmethod
    def _print_metrics(metrics):
        """Print each metric as ``<label>: <value>`` on its own line."""
        for label, value in metrics.items():
            print(f"{label}:", value)

    def evaluate(self):
        """Print accuracy, precision, recall and F1 on the held-out split."""
        self._print_metrics(self._compute_metrics(self.y_encoded_test, self.predict()))

    def tune(self):
        """Grid-search Random Forest and MLP candidates with 5-fold CV.

        For every candidate family the best parameters, the best CV accuracy
        and the held-out metrics are printed. The family with the highest CV
        accuracy is printed last and stored in ``self.best_model`` /
        ``self.best_score``. ``self.pipeline`` is left untouched.
        """
        # GridSearchCV only needs an unfitted template, so a prior fit() is optional.
        template = self.pipeline if self.pipeline is not None else self._build_pipeline()

        param_grids = [
            {'classifier': [RandomForestClassifier()],
             'classifier__n_estimators': [100, 200, 300]},
            # create_model() gives the MLP candidates the same iteration
            # budget as the baseline trained by fit().
            {'classifier': [self.create_model()],
             'classifier__hidden_layer_sizes': [(100,), (50, 50), (25, 25, 25)],
             'classifier__activation': ['relu', 'tanh'],
             'classifier__solver': ['adam']}
        ]

        best_model = None
        best_score = float('-inf')

        for grid in param_grids:
            gs = GridSearchCV(template, grid, cv=5, scoring='accuracy')
            gs.fit(self.x_train, self.y_encoded_train)

            metrics = self._compute_metrics(self.y_encoded_test, gs.predict(self.x_test))

            print("Best Parameters:", gs.best_params_)
            print("Best Score:", gs.best_score_)
            self._print_metrics(metrics)
            print("----------------------------------------------------")

            # Select on the cross-validated score, not on the held-out metrics.
            if gs.best_score_ > best_score:
                best_model = gs.best_estimator_
                best_score = gs.best_score_

        self.best_model = best_model
        self.best_score = best_score if best_model is not None else None

        print("Best Model:", best_model)
        print("Best Score:", best_score)

    def load_test_file(self, file_path):
        """Read the CSV to score into ``self.input_data`` and return it."""
        self.input_data = pd.read_csv(file_path)
        return self.input_data

    def test_data_preprocessor(self):
        """Apply the fitted encoders and scaler to ``self.input_data``.

        Returns:
            The scaled feature matrix, also stored in
            ``self.input_data_processed``. ``self.input_data`` is not modified.
        """
        features = self._encode_features(self.input_data.drop(['item_no'], axis=1), fit=False)
        self.input_data_processed = self.scaler.transform(features)
        return self.input_data_processed

    def predict_for_test_data(self):
        """Return the fitted pipeline's predictions for the processed input data."""
        output = self.pipeline.predict(self.input_data_processed)
        return output

# End-to-end run: load the historic data, preprocess it, fit the baseline MLP
# pipeline, then grid-search the Random Forest and MLP candidates.
pipeline = ANNClassifier()
pipeline.load('historic.csv')
pipeline.preprocess()
# fit() builds its own model through create_model(), so no separate call is needed.
pipeline.fit()
pipeline.tune()

Best Parameters: {'classifier': RandomForestClassifier(n_estimators=300), 'classifier__n_estimators': 300}
Best Score: 0.840625
Accuracy: 0.840625
Precision: 0.8559708295350957
Recall: 0.9063706563706564
F1 Score: 0.880450070323488
----------------------------------------------------
Best Parameters: {'classifier': MLPClassifier(hidden_layer_sizes=(25, 25, 25)), 'classifier__activation': 'relu', 'classifier__hidden_layer_sizes': (25, 25, 25), 'classifier__solver': 'adam'}
Best Score: 0.83125
Accuracy: 0.83375
Precision: 0.8481012658227848
Recall: 0.9054054054054054
F1 Score: 0.8758169934640523
----------------------------------------------------
Best Model: Pipeline(steps=[('scaler', StandardScaler()),
                ('classifier', RandomForestClassifier(n_estimators=300))])
Best Score: 0.840625


On this dataset Random Forest performs slightly better than the ANN (MLP): its best cross-validated accuracy is 0.841, compared with 0.831 for the MLP.

# Justification for choosing Random forest over other algorithms.


**Superior Accuracy**: Random Forest achieved the highest cross-validated accuracy of the models tuned here (about 0.84, roughly one percentage point above the MLP), making it the top performer on this dataset.

**Ensemble Learning Principle**: The algorithm leverages ensemble learning by combining predictions from multiple decision trees, effectively reducing overfitting and improving predictive performance.

**Robustness to Outliers**: Unlike scale-sensitive models such as logistic regression or neural networks, Random Forest splits on feature thresholds, so extreme values and feature scaling have little effect on its predictions, which contributes to its stability and reliability.

**Ease of Interpretation**: Although a forest is harder to read than a single decision tree, Random Forest provides feature-importance scores that show which inputs drive its predictions, and it has only a few hyperparameters (such as the number of trees) to tune.

**Flexibility and Adaptability**: Random Forest's ability to handle various types of data and its adaptability to different problem domains make it a versatile choice for a wide range of applications.






