## Assignment 3 - Competitive Assignment
An explanation this assignment could be found in the .pdf explanation document

In [1]:
# Insturctions
# ============


# ================================================================================================================
# After manually testing different models with different parameters, I have decided to use the MLPClassifier model.
# The MLPClassifier had the best results.
# ================================================================================================================


# =============================
# 1. Run Imports section.
# 2. Run Data Load section.
# 3. Run Data Cleansing section.
# 4. Run Optimization section - Optimization will take a few minutes to finish.
# 5. Run Final Model section.
# 6. Run Prediction section.
# 7. Run Output section.
# =============================


# =================================================================================================================
# Data Cleansing removes any non-hebrew character from the data.
# Optimization searches for the best parameters, testing each combination 5 iterations with shuffled data.
# A part of the Optimization was done manually, therefore some parameters are constant to make this section faster.
# Final Model section uses the best parameters found in the Optimization section.
# The best parameters are based on the average F1 Scores for each N iterations.
# The extra model training with the best parameters will be the one to predict the test data.
# =================================================================================================================

### Preceding Step - import modules (packages)
This step is necessary in order to use external modules (packages). <br/>

In [2]:
# Imports
# =======

import pandas as pd
import numpy as np
import os
from sklearn.neural_network import MLPClassifier
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import f1_score
from sklearn.pipeline import Pipeline

### Reading input files
Reading input files for train annotated corpus (raw text data) corpus and for the test corpus

In [6]:
# Data Load
# =========

train_filename = '.' + os.sep + 'input' + os.sep + 'annotated_corpus_for_train.xlsx'
test_filename  = '.' + os.sep + 'input' + os.sep + 'corpus_for_test.xlsx'
df_train = pd.read_excel(train_filename, 'corpus', index_col=None, na_values=['NA'])
df_test  = pd.read_excel(test_filename,  'corpus', index_col=None, na_values=['NA'])

### Your implementation:
Write your code solution in the following code-cells

In [7]:
# Data Cleansing
# ==============

texts = 'story'
label = 'gender'

# Remove non-hebrew words & characters
df_train[texts] = df_train[texts].str.replace(r'[\W\da-zA-Z]+', ' ')
df_test[texts] = df_test[texts].str.replace(r'[\W\da-zA-Z]+', ' ')

In [8]:
# Optimization
# ============

# Parameters for best optimization
iter = 5
layers = [175, 176, 177]
random = [0, 1, 2]
params = []
params_current = []

f1_avg_max = 0

for l in layers:
    for r in random:
        f1_avg = 0
        f1_current_max = 0

        print()
        print("Start iterations")

        for i in range(0, iter):

            # Scramble the train set
            df_train = df_train.sample(frac=1)#.reset_index(drop=True)

            # Initialize & use CountVectorizer
            vectorizer = CountVectorizer()
            train_data = vectorizer.fit_transform(df_train[texts])

            # Vectorize the data 
            X = pd.DataFrame(train_data.toarray(), columns = vectorizer.get_feature_names())
            y = df_train["gender"]

            # Divide the data into train & validation
            training_portion = int(X.shape[0] * 0.7)
            X_train = X.iloc[0:training_portion]
            y_train = y.iloc[0:training_portion]
            X_validation = X.iloc[training_portion:]
            y_validation = y.iloc[training_portion:]

            # Train the model & Evaluate validation set
            clf = MLPClassifier(activation='identity', hidden_layer_sizes=l, random_state=r, tol=0.00001)
            clf.fit(X_train, y_train)
            pred = clf.predict(X_validation)

            # average = macro represents the average F1 Score between 'f' and 'm' labels
            f1 = f1_score(y_validation, pred, average='macro') 
            f1_avg += f1

            if f1 > f1_current_max:
                f1_current_max = f1
                params_current = [l, r]

        # Save the parameters based on the best results
        f1_avg = f1_avg / iter
        if f1_avg > f1_avg_max:
            f1_avg_max = f1_avg
            params = params_current

        print("End Iterations")
        print()
        print("Average F1_Score For Current Loop = ", f1_avg)
        print("Highest Average F1_Score So Far = ", f1_avg_max)
        print("Highest Average F1_Score Parameters:", params)


print()
print()
print("Optimiziation Finished")


Start iterations
End Iterations

Average F1_Score For Current Loop =  0.6949733992203353
Highest Average F1_Score So Far =  0.6949733992203353
Highest Average F1_Score Parameters: [175, 0]

Start iterations
End Iterations

Average F1_Score For Current Loop =  0.6897501294531422
Highest Average F1_Score So Far =  0.6949733992203353
Highest Average F1_Score Parameters: [175, 0]

Start iterations
End Iterations

Average F1_Score For Current Loop =  0.6963404033266418
Highest Average F1_Score So Far =  0.6963404033266418
Highest Average F1_Score Parameters: [175, 2]

Start iterations
End Iterations

Average F1_Score For Current Loop =  0.642320684655187
Highest Average F1_Score So Far =  0.6963404033266418
Highest Average F1_Score Parameters: [175, 2]

Start iterations
End Iterations

Average F1_Score For Current Loop =  0.6170319737299801
Highest Average F1_Score So Far =  0.6963404033266418
Highest Average F1_Score Parameters: [175, 2]

Start iterations
End Iterations

Average F1_Score 

In [27]:
# Final Model Training Using The Best Results Parameters
# ======================================================

iter = 1
f1_avg = 0

for i in range(0, iter):
    # Scramble the train set
    df_train = df_train.sample(frac=1)#.reset_index(drop=True)

    # Initialize CountVectorizer
    vectorizer = CountVectorizer()
    train_data = vectorizer.fit_transform(df_train[texts])

    # Vectorize the data 
    X = pd.DataFrame(train_data.toarray(), columns = vectorizer.get_feature_names())
    y = df_train["gender"]

    # Divide the data into train & validation
    training_portion = int(X.shape[0] * 0.7)
    X_train = X.iloc[0:training_portion]
    y_train = y.iloc[0:training_portion]
    X_validation = X.iloc[training_portion:]
    y_validation = y.iloc[training_portion:]

    # Train the model with the best parameters found at optimization & Evaluate validation set
    clf = MLPClassifier(activation='identity', hidden_layer_sizes=params[0], random_state=params[1], tol=0.00001, verbose=0)
    clf.fit(X_train, y_train)
    pred = clf.predict(X_validation)
    
    
    # average = macro represents the average F1 Score between 'f' and 'm' labels
    print()
    print("Iteration ", i+1)
    f1 = f1_score(y_validation, pred, average='macro')
    f1_avg += f1
    print("Validation F1_Score:")
    print(f1)

f1_avg = f1_avg / iter
print("Average F1_Score Of All Iterations = ", f1_avg)


Iteration  1
Validation F1_Score:
0.6468699839486356
Average F1_Score Of All Iterations =  0.6468699839486356


In [31]:
# Prediction
# ==========

# Vectorize prediction data
test_data = vectorizer.transform(df_test[texts])
X_to_predict = pd.DataFrame(test_data.toarray(), columns = vectorizer.get_feature_names())

pred = clf.predict(X_to_predict)
df_predicted = pd.DataFrame({"test_example_id": range(0, pred.shape[0]), "predicted_category": pred})

### Save output to csv
After you're done save your output to the 'classification_results.csv' csv file.<br/>
We assume that the dataframe with your results contain the following columns:
* column 1 (left column): 'test_example_id'  - the same id associated to each of the test stories to be predicted.
* column 2 (right column): 'predicted_category' - the predicted gender value for each of the associated story. 

Assuming your predicted values are in the `df_predicted` dataframe, you should save you're results as following:

In [32]:
# Output
# ======

df_predicted.to_csv('classification_results.csv',index=False)