# Loading Vocabulary (All Features )

In [2]:
features = []
word_id = []

vocab = open("vocab.txt", mode = 'r')

for line in vocab.readlines()[:]:
    line = line.strip().split("\t")
    features.append(line[0])
    word_id.append(line[1])

for i in range(len(word_id)):
    word_id[i] = int(word_id[i])

    
# print(features)
# print(word_id)
assert(len(features) == len(word_id))

In [3]:
print(len(word_id))

2038


# Data Pre-processing - Term Frequence (TF)
Generalise the TF for each user to all features by using the mapping in the current dataset

All feature size is 2038

There is a correspoding TF to each wordID. If the wordID is not shown for a user, then the TF for the word is null, which is set to zero for covering the absence of the word in the all-feature list of 2038 for each user.

In [4]:
y_train_count = []
x_train_count = []

y_dev_count = []
x_dev_count = []

y_test_count = []
x_test_count = []

In [5]:
def load_data(filename, labels):
    data = []
    
    count_data = open (filename, mode = 'r')
    for line in count_data.readlines()[1:]:
        line = line.strip().split(",", 2)
        label = line[0]
        labels.append(label)
        numbers = line[2].replace('"', '').replace('[', '').replace(']','').replace('(','').replace(')','')
        data.append(numbers)
        
    return data

In [6]:
def format_numbers(dataset):
    numbers = []

    for data in dataset:
        nums = data.split(',')
        temp = []
        for num in nums:
            num = num.strip().replace('.0', '')
            num = int(num)
            temp.append(num)
        numbers.append(temp)
        
    return numbers

In [7]:
def generalize_count_to_all_features(dataset):
    tf = [] 

    for i in range(len(dataset)):
        mappings = {}
        temp = [0] * len(word_id)

        for j in range(len(dataset[i])):
            if (j % 2 == 0):
                mappings[dataset[i][j]] = dataset[i][j + 1]

        keys = list(mappings.keys())

        for k in range(len(temp)):
            if k in keys:
                temp[k] = mappings[k]
        tf.append(temp)
        
    return tf

In [8]:
# reorganize all count datasets

# train_count
train_count = load_data("train_count.csv", y_train_count)
formatted_numbers = format_numbers(train_count)
x_train_count = generalize_count_to_all_features(formatted_numbers)

# dev_count
dev_count = load_data("dev_count.csv", y_dev_count)
formatted_numbers = format_numbers(dev_count)
x_dev_count = generalize_count_to_all_features(formatted_numbers)

# tes_count
test_count = load_data("test_count.csv", y_test_count)
formatted_numbers = format_numbers(test_count)
x_test_count = generalize_count_to_all_features(formatted_numbers)

In [9]:
print(len(x_train_count))
print(len(x_dev_count))
print(len(x_test_count))

133795
11475
12018


In [10]:
assert len(x_train_count)==len(y_train_count)
assert len(x_dev_count)==len(y_dev_count)
assert len(x_test_count)==len(y_test_count)

# Training Model - Baseline (One-R)

Feature Selection - Chi2

In [14]:
import numpy as np
from sklearn.feature_selection import SelectKBest, chi2

x2 = SelectKBest(chi2, k=1)

x2.fit_transform(x_train_count, y_train_count)
best_feature_x2 = x2.get_support(indices=True)

In [15]:
print(best_feature_x2)

[0]


Feature Selection - MI

In [16]:
from sklearn.feature_selection import mutual_info_classif

mi = SelectKBest(score_func=mutual_info_classif, k=1)

mi.fit_transform(x_train_count, y_train_count)
best_feature_mi = mi.get_support(indices=True)

In [17]:
print(best_feature_mi)

[946]


Apply oneR Model - dummy way

In [18]:
def create_feature_column(features):
    feature_column = []
    for i in range(len(features[0])):
        atts = []
        for f in features:
            atts.append(f[i])
        feature_column.append(atts)
        
    return feature_column

In [19]:
x_train_count_columns = create_feature_column(x_train_count)

Using best feature selected by Chi2

In [20]:
from collections import Counter
from sklearn.metrics import accuracy_score

target_label = []
for i in range(len(x_train_count_columns[0])):
    if (x_train_count_columns[0][i] != 0):
        target_label.append(y_train_count[i]) 
count_majority_label = Counter(target_label).most_common()[0][0]

In [21]:
y_oneR_count_chi2 = [count_majority_label] * len(y_dev_count)

In [13]:
from sklearn.metrics import classification_report
print(classification_report(y_dev_count, y_oneR_count_chi2, target_names = ['MIDWEST', 'NORTHEAST', 'SOUTH', 'WEST']))

NameError: name 'y_oneR_count_chi2' is not defined

Using best feature selected by MI

In [24]:
target_label = []
for i in range(len(x_train_count_columns[946])):
    if (x_train_count_columns[946][i] != 0):
        target_label.append(y_train_count[i]) 
count_majority_label = Counter(target_label).most_common()[0][0]

In [25]:
y_oneR_count_mi = [count_majority_label] * len(y_dev_count)

In [26]:
print(classification_report(y_dev_count, y_oneR_count_mi, target_names = ['MIDWEST', 'NORTHEAST', 'SOUTH', 'WEST']))

              precision    recall  f1-score   support

     MIDWEST       0.00      0.00      0.00      1484
   NORTHEAST       0.37      1.00      0.54      4295
       SOUTH       0.00      0.00      0.00      4266
        WEST       0.00      0.00      0.00      1430

    accuracy                           0.37     11475
   macro avg       0.09      0.25      0.14     11475
weighted avg       0.14      0.37      0.20     11475



Apply oneR Model - decision tree

In [27]:
from sklearn.tree import DecisionTreeClassifier

dt1 = DecisionTreeClassifier(max_depth=1)
dt1.fit(x_train_count, y_train_count)
y_oneR_count = dt1.predict(x_dev_count)

In [28]:
print(classification_report(y_dev_count, y_oneR_count, target_names = ['MIDWEST', 'NORTHEAST', 'SOUTH', 'WEST']))

              precision    recall  f1-score   support

     MIDWEST       0.00      0.00      0.00      1484
   NORTHEAST       0.41      0.81      0.54      4295
       SOUTH       0.40      0.28      0.33      4266
        WEST       0.00      0.00      0.00      1430

    accuracy                           0.41     11475
   macro avg       0.20      0.27      0.22     11475
weighted avg       0.30      0.41      0.33     11475



  _warn_prf(average, modifier, msg_start, len(result))


# Training Model - Logistic Regression (LR)

In [11]:
from sklearn.linear_model import LogisticRegression

LR = LogisticRegression(max_iter=2000)

LR.fit(x_train_count, y_train_count)

y_LR_count = LR.predict(x_dev_count)

In [14]:
print(classification_report(y_dev_count, y_LR_count, target_names = ['MIDWEST', 'NORTHEAST', 'SOUTH', 'WEST']))

              precision    recall  f1-score   support

     MIDWEST       0.24      0.02      0.03      1484
   NORTHEAST       0.50      0.63      0.56      4295
       SOUTH       0.43      0.58      0.50      4266
        WEST       0.20      0.03      0.05      1430

    accuracy                           0.46     11475
   macro avg       0.34      0.31      0.28     11475
weighted avg       0.40      0.46      0.40     11475



In [15]:
print(len(LR.coef_))

4


In [16]:
y_predict_count = LR.predict(x_test_count)

In [17]:
print(len(y_predict_count))

12018


In [25]:
y_predict = list(y_predict_count)
print(y_predict)

['SOUTH', 'NORTHEAST', 'SOUTH', 'SOUTH', 'NORTHEAST', 'NORTHEAST', 'SOUTH', 'SOUTH', 'NORTHEAST', 'SOUTH', 'SOUTH', 'NORTHEAST', 'NORTHEAST', 'NORTHEAST', 'NORTHEAST', 'SOUTH', 'SOUTH', 'SOUTH', 'NORTHEAST', 'SOUTH', 'SOUTH', 'SOUTH', 'SOUTH', 'SOUTH', 'SOUTH', 'NORTHEAST', 'SOUTH', 'SOUTH', 'NORTHEAST', 'SOUTH', 'SOUTH', 'SOUTH', 'SOUTH', 'NORTHEAST', 'SOUTH', 'SOUTH', 'NORTHEAST', 'NORTHEAST', 'NORTHEAST', 'NORTHEAST', 'SOUTH', 'SOUTH', 'SOUTH', 'NORTHEAST', 'SOUTH', 'SOUTH', 'MIDWEST', 'SOUTH', 'NORTHEAST', 'NORTHEAST', 'NORTHEAST', 'SOUTH', 'SOUTH', 'SOUTH', 'SOUTH', 'SOUTH', 'SOUTH', 'SOUTH', 'SOUTH', 'NORTHEAST', 'SOUTH', 'NORTHEAST', 'NORTHEAST', 'SOUTH', 'SOUTH', 'NORTHEAST', 'SOUTH', 'WEST', 'SOUTH', 'SOUTH', 'SOUTH', 'NORTHEAST', 'NORTHEAST', 'SOUTH', 'SOUTH', 'SOUTH', 'SOUTH', 'NORTHEAST', 'SOUTH', 'NORTHEAST', 'NORTHEAST', 'NORTHEAST', 'SOUTH', 'SOUTH', 'NORTHEAST', 'MIDWEST', 'NORTHEAST', 'SOUTH', 'SOUTH', 'NORTHEAST', 'NORTHEAST', 'SOUTH', 'SOUTH', 'SOUTH', 'SOUTH', 'SOUT

In [27]:
import csv


# name of csv file
filename = "y_predict_count.csv"

id_count = 0

# create a file
f = open (filename, 'x')

# writing to csv file
with open(filename, 'w', newline='') as csvfile:
    fields = ['id', 'region']
    
    thewriter = csv.DictWriter(csvfile, fieldnames=fields)
    
    thewriter.writeheader()
    
    for i in range(len(y_predict)):
        id_count += 1
        thewriter.writerow({'id': id_count, 'region': y_predict[i]})

# Training Model - Multi-Layer Perceptron (MLP)

In [33]:
from sklearn.neural_network import MLPClassifier

mlp = MLPClassifier(max_iter=2000)

mlp.fit(x_train_count, y_train_count)

y_mlp_count = mlp.predict(x_dev_count)

In [34]:
print(mlp.coefs_)
print(mlp.n_layers_)

[array([[-0.11574715,  0.25065723, -0.0864412 , ..., -0.35194764,
         0.09201767,  0.20204672],
       [-0.06320828, -0.04948128,  0.16775718, ..., -0.12952351,
        -0.31519508,  0.2802715 ],
       [-0.11539862, -0.50339257,  0.22717305, ...,  0.90837512,
        -0.70160645,  0.51814071],
       ...,
       [-0.08439192,  0.16945733, -0.40590394, ...,  0.31468836,
        -0.09941297,  0.65590632],
       [ 0.16930954, -0.3002652 , -0.27370342, ..., -0.03816459,
        -0.06483399, -0.34949345],
       [-0.0965506 , -0.41344271, -0.61403323, ..., -0.47975609,
        -0.18793058,  0.18611702]]), array([[-6.05344417e-01, -2.52096247e+00,  1.57251485e+00,
         1.71813914e+00],
       [ 1.96887665e+00,  1.32302992e+00,  9.96065448e-01,
        -6.26862375e+00],
       [ 5.99074914e-01,  2.61763217e+00, -4.70643313e+00,
         2.49363624e+00],
       [ 2.30432060e+00, -2.92237047e+00,  2.24231533e+00,
        -1.87586078e+00],
       [-2.35812948e+00,  1.80170437e+00, -2.

In [35]:
print(classification_report(y_dev_count, y_mlp_count, target_names = ['MIDWEST', 'NORTHEAST', 'SOUTH', 'WEST']))

              precision    recall  f1-score   support

     MIDWEST       0.13      0.11      0.12      1484
   NORTHEAST       0.46      0.49      0.47      4295
       SOUTH       0.40      0.40      0.40      4266
        WEST       0.14      0.13      0.13      1430

    accuracy                           0.36     11475
   macro avg       0.28      0.28      0.28     11475
weighted avg       0.35      0.36      0.36     11475



# Training Model - Random Forest (RF)

In [37]:
from sklearn.ensemble import RandomForestClassifier

rf = RandomForestClassifier()

rf.fit(x_train_count, y_train_count)

y_rf_count = rf.predict(x_dev_count)

In [38]:
print(classification_report(y_dev_count, y_rf_count, target_names = ['MIDWEST', 'NORTHEAST', 'SOUTH', 'WEST']))
print("Accuracy: ", rf.score(x_dev_count, y_dev_count))

              precision    recall  f1-score   support

     MIDWEST       0.17      0.04      0.06      1484
   NORTHEAST       0.46      0.57      0.51      4295
       SOUTH       0.42      0.53      0.47      4266
        WEST       0.14      0.04      0.07      1430

    accuracy                           0.42     11475
   macro avg       0.30      0.29      0.28     11475
weighted avg       0.37      0.42      0.38     11475

Accuracy:  0.4203921568627451
