# Setup

In [None]:
# This mounts ***YOUR*** Google Drive at /content/drive/.
from google.colab import drive
drive.mount('/content/drive/')

Mounted at /content/drive/


In [None]:
# 1) Download the clean_dataset.csv from quercus
# 2) Import it to your google drive
# 3) Move it to a folder called "csc311"
# 4) Run the above code, give permission
# 5) Run this code, it should give no errors if it can find the dataset
# Then you can access the dataset in code by "clean_dataset.csv"
![[ -f '/content/drive/MyDrive/csc311/clean_dataset.csv' ]] || echo "Couldn't find clean_dataset.csv"
![[ -f '/content/drive/MyDrive/csc311/clean_dataset.csv' ]] && cp '/content/drive/MyDrive/csc311/clean_dataset.csv' 'clean_dataset.csv'

In [None]:
# This is the starter code from challenge_basic.py on quercus
"""
This Python file provides some useful code for reading the training file
"clean_dataset.csv". You may adapt this code as you see fit. However,
keep in mind that the code provided does only basic feature transformations
to build a rudimentary kNN model in sklearn. Not all features are considered
in this code, and you should consider those features! Use this code
where appropriate, but don't stop here!
"""
from pprint import pprint
import re
import pandas as pd
from sklearn.neighbors import KNeighborsClassifier
import matplotlib.pyplot as plt
import numpy as np
import random

file_name = "clean_dataset.csv"
random_state = 42

def to_numeric(s):
    """Parse `s` as a float, tolerating thousands separators.

    String input has its commas removed and is then parsed with
    ``pd.to_numeric(..., errors="coerce")``, so text that is not a number
    becomes NaN. Any other input is handed straight to ``float``.
    """
    if not isinstance(s, str):
        return float(s)

    parsed = pd.to_numeric(s.replace(",", ''), errors="coerce")
    return float(parsed)

def get_number_list(s):
    """Return every run of digits found in `s` as a list of ints.

    `s` is converted with ``str`` first, so non-string values (NaN included)
    are accepted. Input without any digits yields an empty list.
    """
    # Raw string: "\d" inside a regular literal is an invalid escape sequence.
    return [int(n) for n in re.findall(r"\d+", str(s))]

def get_number_list_clean(s):
    """Extract the numbers in `s` and pad the list with -1 up to six entries.

    A list that already holds six or more numbers is returned as is; it is
    not truncated.
    """
    numbers = get_number_list(s)
    missing = 6 - len(numbers)
    # A non-positive repeat count produces an empty list, i.e. no padding.
    return numbers + [-1] * missing

def get_number(s):
    """Return the first number that appears in `s`, or -1 if there is none."""
    numbers = get_number_list(s)
    if not numbers:
        return -1
    return numbers[0]

def find_area_at_rank(l, i):
    """Return the 1-based position of the first `i` in list `l`.

    Returns -1 when `i` does not occur in `l`.
    """
    try:
        position = l.index(i)
    except ValueError:
        return -1
    return position + 1

def cat_in_s(s, cat):
    """Return 1 if `cat` occurs in `s`, else 0.

    A missing value (anything ``pd.isna`` reports as NA) counts as 0.
    """
    if pd.isna(s):
        return 0
    return int(cat in s)


def make_bow(data, vocab):
    """Build a binary bag-of-words matrix.

    Parameters:
        `data`  - sized iterable of texts; entries that are not strings
                  (NaN, None, numbers) produce an all-zero row
        `vocab` - sequence of words; column j of the result stands for vocab[j]

    Returns a float array of shape (len(data), len(vocab)) where entry
    [i, j] is 1 if vocab[j] is one of the whitespace-separated tokens of
    data[i], and 0 otherwise.
    """
    X = np.zeros([len(data), len(vocab)])
    mapping = {word: i for i, word in enumerate(vocab)}
    for i, review in enumerate(data):
        # `review is np.nan` only catches that one NaN object; a type check
        # also covers float('nan'), None and other non-text cells.
        if not isinstance(review, str):
            continue
        for word in review.split():
            col = mapping.get(word)
            if col is not None:
                X[i, col] = 1
    return X



def readData(filename, vocab, shuffle=True):
    '''
    Read <filename> and turn it into a feature matrix x and targets y.

    Parameters:
        `filename` - path of the survey CSV (columns Q1..Q10, optionally Label)
        `vocab`    - None when training: the vocabulary is then built from Q10
                     and written to the vocab file on Google Drive.
                     When predicting, pass the list of vocabulary words.
        `shuffle`  - if True (default) the rows are permuted with the
                     module-level `random_state` seed; pass False to keep the
                     rows in file order.

    Returns (x, y):
        x - float array with one row per CSV row. The columns are, in order:
            eight indicators (values -1..6) for each of Q1-Q4 and
            rank_1..rank_6, four Q5 indicators, standardized Q7/Q8/Q9, and
            one bag-of-words column per vocabulary word.
        y - one-hot DataFrame of the Label column, or [] if there is no Label.

    The indicator columns are always laid out in the same fixed order, so the
    feature layout does not depend on which answers happen to occur in the
    file. Weights trained on a different layout must be retrained.

    NOTE(review): Q7-Q9 are standardized with the mean/std of the file being
    read, not with statistics saved from training -- confirm this is intended.
    '''
    df = pd.read_csv(filename)

    # Clean numerics: unparseable or empty answers become 0
    for col in ["Q7", "Q8", "Q9"]:
        df[col] = df[col].apply(to_numeric).fillna(0)

    # Clean for number categories: keep the first number of each answer
    for col in ["Q1", "Q2", "Q3", "Q4"]:
        df[col] = df[col].apply(get_number)

    # Create area rank categories: rank_i is the 1-based position of the
    # value i among the numbers found in Q6, or -1 if i does not occur
    area_lists = df["Q6"].apply(get_number_list_clean)
    rank_names = []
    for i in range(1, 7):
        col_name = f"rank_{i}"
        rank_names.append(col_name)
        df[col_name] = area_lists.apply(lambda l, i=i: find_area_at_rank(l, i))

    del df["Q6"]

    # Create category indicators with a fixed column set per question.
    # reindex() adds all-zero columns for values that do not occur and puts
    # every column in the same position regardless of the data.
    new_names = []
    for col in ["Q1", "Q2", "Q3", "Q4"] + rank_names:
        wanted = [f"{col}_{value}" for value in range(-1, 7)]
        indicators = pd.get_dummies(df[col], prefix=col)
        indicators = indicators.reindex(columns=wanted, fill_value=0)
        df = pd.concat([df, indicators], axis=1)
        del df[col]
        new_names.extend(wanted)

    # Create multi-category indicators
    for cat in ["Partner", "Friends", "Siblings", "Co-worker"]:
        cat_name = f"Q5{cat}"
        new_names.append(cat_name)
        df[cat_name] = df["Q5"].apply(lambda s, cat=cat: cat_in_s(s, cat))

    del df["Q5"]

    # bags of words for Q10; missing text becomes '' (an all-zero row).
    # Assign the result back instead of using inplace=True on a column
    # selection, which does not reliably modify the frame.
    df['Q10'] = df['Q10'].str.lower().fillna('')
    if vocab is None:
        # A dict keeps first-seen order and gives O(1) membership tests
        seen = {}
        for line in df['Q10']:
            for word in re.sub(r'\W+', ' ', line).split():
                seen.setdefault(word, None)
        vocab = list(seen)

        with open('/content/drive/My Drive/csc311/vocab.txt', 'w') as file:
            for word in vocab:
                file.write(word + '\n')

    # Give the bag-of-words frame the same index as df so that the rows can
    # be matched up explicitly after shuffling.
    dfq10 = pd.DataFrame(make_bow(df['Q10'], vocab), columns=vocab, index=df.index)

    if shuffle:
        df = df.sample(frac=1, random_state=random_state) # permute the rows randomly

    if "Label" in df.keys():
        y = pd.get_dummies(df["Label"].values)
    else:
        y = [] # shouldn't be used by predict_all anyways

    feature_names = new_names + ["Q7", "Q8", "Q9"]
    df = pd.concat([df[feature_names], dfq10.reindex(df.index)], axis=1)

    x = np.array(df, dtype=float)

    # Normalize the numeric columns to zero mean and unit variance
    for col in ["Q7", "Q8", "Q9"]:
        idx = feature_names.index(col)
        vals = x[:, idx]
        std = vals.std()
        if std == 0:
            std = 1e-10 # constant column: avoid dividing by zero
        x[:, idx] = (vals - vals.mean()) / std

    return x,y

if __name__ == "__main__":
  # Training mode (vocab=None): builds the features and the vocabulary
  x, y = readData(file_name, None)

  # test-train split: the first n_train rows train, the remainder tests
  n_train = 1467

  x_train, x_test = x[:n_train], x[n_train:]
  y_train, y_test = y[:n_train], y[n_train:]

In [None]:
from sklearn.tree import DecisionTreeClassifier

# Greedy hyperparameter search for a DecisionTreeClassifier: for each split
# criterion, first scan max_depth, then scan min_samples_split using the best
# depth found so far.
# NOTE(review): candidates are ranked by their score on x_test/y_test, so the
# reported "test acc" is also the selection metric -- consider a separate
# validation split.
d = 100
s = 100

crit = ["gini", "entropy", "log_loss"]
best_params = [0, -1, -1] # [crit, mdepth, mi_split]
best_test_acc = 0

# Fallback values used when a scan never improves on best_test_acc
def_mdepth = None
def_mi_split = 2

for i in range(3):
    # Scan max_depth = 1 .. d-1 with the default min_samples_split
    for j in range(1, d):
        clf = DecisionTreeClassifier(criterion = crit[i], max_depth=j)
        clf.fit(x_train, y_train)
        train_acc = clf.score(x_train, y_train)  # computed but not used in this cell's search
        test_acc = clf.score(x_test, y_test)
        if best_test_acc < test_acc:
            best_test_acc = test_acc
            best_params[0] = crit[i]
            best_params[1] = j
            best_params[2] = def_mi_split
    if best_params[1] == -1:
        best_params[1] = def_mdepth

    # Scan min_samples_split = 2 .. s-1 at the best depth found so far
    # (which may have been found under an earlier criterion)
    for j in range(2, s):
        clf = DecisionTreeClassifier(criterion = crit[i], max_depth=best_params[1], min_samples_split = j)
        clf.fit(x_train, y_train)
        train_acc = clf.score(x_train, y_train)
        test_acc = clf.score(x_test, y_test)
        if best_test_acc < test_acc:
            best_test_acc = test_acc
            best_params[0] = crit[i]
            best_params[2] = j
    # NOTE(review): this repeats the max_depth check above; it looks like it
    # was meant to test best_params[2] against def_mi_split -- confirm.
    if best_params[1] == -1:
        best_params[1] = def_mdepth

print(f"Best parameters: {best_params}")
print(f"{type(clf).__name__} test acc: {best_test_acc}")

# Refit once with the selected parameters. No random_state is passed, so the
# refit score is not guaranteed to equal best_test_acc.
clf = DecisionTreeClassifier(criterion = best_params[0], max_depth=best_params[1], min_samples_split = best_params[2])
clf.fit(x_train, y_train)
train_acc = clf.score(x_train, y_train)
test_acc = clf.score(x_test, y_test)
print(f"{type(clf).__name__} test acc: {test_acc}")


KeyboardInterrupt: 

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Greedy hyperparameter search for a RandomForestClassifier: for every
# (criterion, max_features) pair, scan n_estimators, then max_depth, then
# min_samples_split, each time keeping whatever beat the best score so far.
# NOTE(review): candidates are ranked by their score on x_test/y_test, so the
# reported "test acc" is also the selection metric.
e = 100  # not used in this cell
d = 100
s = 100

# Fallback values used when a scan never improves on best_test_acc
def_n_est = 42
def_mdepth = 13
def_mi_split = 2

crit = ["gini", "entropy", "log_loss"]
max_f = ["sqrt", "log2"]
best_params = [0, 0, -1, -1, -1] # [crit, max_f, n_est, m_depth, mi_split]
best_test_acc = 0

for i in range(3):
    for j in range(2):
        # Test for best n estimators first
        # (range(89, 90) only ever tries n_estimators = 89)
        for k in range(89, 90):
            # An earlier variant reused the best max_depth / min_samples_split
            # found so far; this version uses the library defaults for both.
            clf = RandomForestClassifier(n_estimators = k, criterion = crit[i], max_features = max_f[j])
            clf.fit(x_train, y_train)
            train_acc = clf.score(x_train, y_train)
            test_acc = clf.score(x_test, y_test)
            if best_test_acc < test_acc:
                best_test_acc = test_acc
                best_params[0] = crit[i]
                best_params[1] = max_f[j]
                best_params[2] = k
                best_params[3] = None
                best_params[4] = 2
        if best_params[2] == -1:
            best_params[2] = def_n_est

        # Test for best max depth (1 .. d) with the best n_estimators so far
        for k in range(1, d + 1):
            # An earlier variant also passed the best min_samples_split so far.
            clf = RandomForestClassifier(n_estimators = best_params[2], criterion = crit[i], max_depth = k, max_features = max_f[j])
            clf.fit(x_train, y_train)
            train_acc = clf.score(x_train, y_train)
            test_acc = clf.score(x_test, y_test)
            if best_test_acc < test_acc:
                best_test_acc = test_acc
                best_params[0] = crit[i]
                best_params[1] = max_f[j]
                best_params[3] = k
                best_params[4] = 2
        if best_params[3] == -1:
            best_params[3] = def_mdepth

        # Finally, test for best min_sample_split (2 .. s)
        for k in range(2, s + 1):
            clf = RandomForestClassifier(n_estimators = best_params[2], criterion = crit[i], max_depth = best_params[3], min_samples_split = k, max_features = max_f[j])
            clf.fit(x_train, y_train)
            train_acc = clf.score(x_train, y_train)
            test_acc = clf.score(x_test, y_test)
            if best_test_acc < test_acc:
                best_test_acc = test_acc
                best_params[0] = crit[i]
                best_params[1] = max_f[j]
                best_params[4] = k
        if best_params[4] == -1:
            best_params[4] = def_mi_split

print(f"Best parameters: {best_params}")
print(f"{type(clf).__name__} test acc: {best_test_acc}")

# Refit once with the selected parameters. No random_state is passed, so the
# refit score is not guaranteed to equal best_test_acc.
clf = RandomForestClassifier(n_estimators = best_params[2], criterion = best_params[0], max_depth = best_params[3], min_samples_split = best_params[4], max_features = best_params[1])
clf.fit(x_train, y_train)
train_acc = clf.score(x_train, y_train)
test_acc = clf.score(x_test, y_test)
print(f"{type(clf).__name__} test acc: {test_acc}")



KeyboardInterrupt: 

In [None]:
def dataframe_to_targets(y):
  '''
  Turn a one-hot Pandas DataFrame into a list of label names.

  For every row, the result holds the name of the first column whose entry
  is non-zero, or '' if the whole row is zero.
  '''
  labels = y.keys()
  targets = []

  for row in y.values:
    hit = next((i for i, x in enumerate(row) if x != 0), None)
    targets.append('' if hit is None else labels[hit])
  return targets

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score
from pprint import pprint
# Logistic regression without a bias term. The deprecated `multi_class`
# argument is left out: with the newton-cg solver and more than two classes
# scikit-learn already fits a multinomial (softmax) model.
clf = LogisticRegression(max_iter=9000, fit_intercept=False, solver='newton-cg')

# Collapse the one-hot label frames into plain lists of label names
y_train2 = dataframe_to_targets(y_train)
y_test2 = dataframe_to_targets(y_test)

clf.fit(x_train,y_train2)
train_acc = clf.score(x_train, y_train2)
test_acc = clf.score(x_test, y_test2)
print(f"{type(clf).__name__} train acc: {train_acc}")
print(f"{type(clf).__name__} test acc: {test_acc}")

# Save the coefficient matrix, one space-separated row per line.
# NOTE(review): the rows follow clf.classes_; my_logistic_regression
# hard-codes its own class order -- confirm the two agree.
weights=np.array(clf.coef_, dtype=float)
with open('/content/drive/My Drive/csc311/weights.txt', 'w') as file:
    file.writelines(' '.join(map(str, row)) + '\n' for row in weights)

predicted_labels = clf.predict(x_test)
f1 = f1_score(y_test2, predicted_labels, average='weighted')
print('f1 score', f1)

LogisticRegression train acc: 0.9652351738241309
LogisticRegression test acc: 1.0
f1 score 1.0


In [None]:
from sklearn.neural_network import MLPClassifier
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
# clf = MLPClassifier(random_state=0, max_iter=30, batch_size=20,learning_rate_init=0.001).fit(x_train, y_train)
# acc_train = clf.score(x_train, y_train)
# acc_test = clf.score(x_test, y_test)
# print("train acc", acc_train)
# print("test acc", acc_test)
def build_all_models_neural(batch_size,
                     max_iter,
                     learning_rate_init,
                     random_state=(None,),
                     X_train=x_train,
                     t_train=y_train,
                     X_valid=x_test,
                     t_valid=y_test):
    """
    Fit one MLPClassifier per hyperparameter combination and record its scores.

    Parameters:
        `batch_size` - iterable of batch_size values to try
        `max_iter` - iterable of max_iter values to try
        `learning_rate_init` - iterable of learning_rate_init values to try
        `random_state` - iterable of random_state values to try
                         (default: a single run with random_state=None)
        `X_train`, `t_train` - data the models are fitted on
        `X_valid`, `t_valid` - data used for the 'test' score

    The data defaults are bound to the notebook globals that exist when this
    function is defined, not when it is called.

    Returns a dictionary, `out`, keyed by
    (batch_size, max_iter, learning_rate_init, random_state), where
        out[key]['test']  = score on (X_valid, t_valid) and
        out[key]['train'] = score on (X_train, t_train)
    (both via the `score()` method).
    """
    out = {}

    for b in batch_size:
        for i in max_iter:
          for a in learning_rate_init:
            for j in random_state:
              # Create an MLPClassifier for this combination and fit it
              clf = MLPClassifier(random_state=j, max_iter=i, learning_rate_init=a, batch_size=b).fit(X_train, t_train)
              out[(b, i , a, j)] = {
                  'test': clf.score(X_valid, t_valid),
                  'train': clf.score(X_train, t_train),
              }
    return out
# Hyperparameter grid: a single candidate per parameter in this run
batch_size = [80]
max_iter = [20]
learning_rate_init = [0.001]
best_score1 = 0
res = build_all_models_neural(batch_size, max_iter, learning_rate_init,X_train=x_train,
                     t_train=y_train,
                     X_valid=x_test,
                     t_valid=y_test)
# Keep the combination with the highest 'test' score.
# NOTE(review): best_para1 is only assigned when some score exceeds 0;
# otherwise the print below raises NameError.
for b, i, a, j in res:
      test_score = res[(b, i , a, j)]['test']
      if test_score > best_score1:
        best_score1 = test_score
        best_para1 = {'batch_size': b,
                     'max_iter': i,
                     'learning_rate_init': a,
                     }
print("Best parameters", best_para1, "Best score:", best_score1)
# NOTE(review): the final model repeats the grid values by hand instead of
# using best_para1, and uses random_state=0 whereas the search above ran with
# random_state=None, so its score can differ from best_score1.
clf = MLPClassifier(random_state=0, max_iter=20, batch_size=80,learning_rate_init=0.001).fit(x_train, y_train)
acc_train = clf.score(x_train, y_train)
acc_test = clf.score(x_test, y_test)
print("train acc", acc_train)
print("test acc", acc_test)
predicted_labels = clf.predict(x_test)
# f1_score is imported in the logistic-regression cell, which must run first
f1 = f1_score(y_test, predicted_labels, average='weighted')
print('f1 socore', f1)



Best parameters {'batch_size': 80, 'max_iter': 20, 'learning_rate_init': 0.001} Best score: 0.891156462585034
train acc 0.9531516183986372
test acc 0.8843537414965986
f1 socore 0.9188097690519028




In [None]:
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB

# Gaussian naive Bayes baseline, trained on label names rather than one-hot rows
gnb = GaussianNB()
gnb.fit(x_train, dataframe_to_targets(y_train))
y_pred = gnb.predict(x_test)

# Comparing the list of true labels with the prediction array is elementwise
gnb_true_labels = dataframe_to_targets( y_test )
mislabelled = (gnb_true_labels != y_pred).sum()
print("Number of mislabeled points out of a total", len(y_test),"points : ",  mislabelled, ' (acc ', 1-(mislabelled/len(y_test)), ')')
print(gnb.get_params(deep=True))

Number of mislabeled points out of a total 294 points :  103  (acc  0.6496598639455782 )
{'priors': None, 'var_smoothing': 1e-09}


In [None]:
def my_logistic_regression(x_data,weights):
    '''
    Predict a city for every row of x_data with a softmax linear model.

    x_data:  one sample per row (converted to a float array)
    weights: one row of coefficients per class, in the order
             Dubai, New York City, Paris, Rio de Janeiro

    Returns a list with the predicted city name for each sample.
    see https://en.wikipedia.org/wiki/Logistic_regression
    '''
    cities = np.array(['Dubai', 'New York City', 'Paris', 'Rio de Janeiro'])
    features = np.array(x_data, dtype=float)

    # Class scores: one row per class, one column per sample
    logits = np.matmul(weights, features.T, dtype=float)

    # Softmax over the class axis; subtracting the per-sample maximum keeps
    # the exponentials from overflowing
    exp_scores = np.exp(logits - np.max(logits, axis = 0))
    probabilities = exp_scores / np.sum(exp_scores, axis = 0)

    winners = np.argmax(probabilities, axis = 0)
    return [cities[j] for j in winners]


In [None]:
# This is the starter code from example_pred.py on quercus
"""
This Python file is example of how your `pred.py` script should
look. Your file should contain a function `predict_all` that takes
in the name of a CSV file, and returns a list of predictions.

Your `pred.py` script can use different methods to process the input
data, but the format of the input it takes and the output your script produces should be the same.

Here's an example of how your script may be used in our test file:

    from example_pred import predict_all
    predict_all("example_test_set.csv")
"""

# basic python imports are permitted
import sys
import csv
import random

# numpy and pandas are also permitted
import numpy as np
import pandas as pd

WEIGHTS_FILE='/content/drive/My Drive/csc311/weights.txt'
VOCAB_FILE='/content/drive/My Drive/csc311/vocab.txt'

def predict(x,weights):
    """
    Predict a city for every row of the feature matrix x.

    Thin wrapper that delegates to my_logistic_regression with the given
    weights and returns its list of city names unchanged.
    NOTE: the city names must match the expected spelling/capitalization.
    """
    predictions = my_logistic_regression(x, weights)
    return predictions

def random_predict(x):
    """
    Baseline predictor: ignore x and pick one of the four cities at random.
    This code is here for demonstration purposes only.
    """
    # NOTE: be *very* careful with the spelling/capitalization of the cities!
    cities = ['Dubai', 'Rio de Janeiro', 'New York City' ,'Paris']
    return random.choice(cities)

def load_weights_vocab(weights_filename, vocab_filename):
  '''
  Import weights and vocab from external files, to keep pred.py clean

  weights_filename: text file with one row of space-separated floats per line
  vocab_filename:   text file with one vocabulary word per line

  Returns (weights, vocab): a 2-D float array and a list of words.
  Blank lines in either file are ignored.
  '''
  weights = []
  with open(weights_filename, 'r') as file:
      for line in file:
          row = list(map(float, line.split()))  # Split by space and convert back to floats
          if row:
              weights.append(row)

  weights = np.array(weights, dtype=float)

  vocab = []
  with open(vocab_filename, 'r') as file:
    for line in file:
      # Drop the trailing newline; with it attached no word would ever match
      # a token of the text being classified.
      word = line.strip()
      if word:
        vocab.append(word)

  return weights, vocab

def predict_all(filename):
    """
    Return the list of predicted cities for the data in filename.

    Loads the saved weights and vocabulary, builds the features with
    readData and classifies them with predict.

    NOTE(review): readData permutes the rows with a fixed seed, so the
    predictions come back in that shuffled order rather than in file
    order -- confirm this is what the caller expects.
    """
    weights, vocab = load_weights_vocab(WEIGHTS_FILE, VOCAB_FILE)
    features, _ = readData(filename, vocab)
    return predict(features, weights)

def accuracy(t,y):
  '''
  Fraction of positions where predictions t equal the real targets y.
  '''
  mismatches = np.array(t) != np.array(y)
  error_rate = sum(mismatches) / len(y)
  return 1 - error_rate

# test code
if __name__=='__main__':
  # Sanity check: the hand-written predictor should score like the sklearn
  # model whose weights it loads.
  weights, vocab = load_weights_vocab(WEIGHTS_FILE, VOCAB_FILE)
  # Held-out rows, using the features built in the training cell
  predictions = predict(x_test,weights)
  # Whole file, going through the same path a caller of predict_all would use
  preds = predict_all(file_name)
  print('test',end=' ')
  pprint(accuracy(predictions,y_test2))
  # y was produced by readData with the same seed that predict_all's readData
  # call uses, so preds and y2 are in the same (shuffled) row order.
  y2 = dataframe_to_targets(y)
  print('entire file',end=' ')
  pprint(accuracy(preds,y2))

test 0.8877551020408163
entire file 0.8596730245231607
