# Loading Libraries

In [None]:
from google.colab import drive

# Preprocessing
import re
import os
import spacy
import pickle
import pandas as pd
from pandas import read_csv

# FastText
import fasttext
import fasttext.util

import random

# MEN and SimLex Benchmarks
from os import listdir
from scipy.stats import spearmanr
from sklearn.metrics.pairwise import cosine_similarity

# ElasticNet and ANN
import sklearn
from sklearn.model_selection import cross_val_score, RepeatedKFold, train_test_split, KFold, LeaveOneOut, StratifiedKFold
from sklearn.metrics import mean_absolute_error, accuracy_score, mean_squared_error, median_absolute_error
from sklearn.linear_model import ElasticNetCV

import numpy as np
from numpy import mean
from numpy import std
from numpy import absolute
from numpy.random import seed

from scipy import stats

# PCA
from sklearn.manifold import TSNE
from sklearn.decomposition import PCA

import time

from statistics import median

from tensorflow import keras
from tensorflow.random import set_seed
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.initializers import HeNormal
from tensorflow.keras.layers import MaxPooling2D, MaxPooling1D, Conv2D, Conv1D, Bidirectional
from keras.models import Sequential
from keras.layers.core import Dense, Dropout, Activation

In [None]:
# Google Drive locations used to import/export the dicts, texts, models, and pickles.
# All of them are relative paths starting at "drive/".
# NOTE(review): pickle_path is spelled "drive/MyDrive/..." while every other
# constant here uses "drive/My Drive/..." - confirm both spellings resolve on
# the mounted drive.
path = "drive/My Drive/Thesis/Data/CoCA/Text/"                                  # CoCA text directory
dict_path = "drive/My Drive/Thesis/Data/CoCA/dict_pickles/"
unclean_path = path + "texts_combined/all_texts_combined.txt"                   # built on top of `path`
model_path = "drive/My Drive/Thesis/Data/CoCA/models/"
pickle_path = "drive/MyDrive/Thesis/Data/fastText and others/"
norms_path = "drive/My Drive/Thesis/Data/Norms/"
csv_path = "drive/My Drive/Thesis/Data/CSV/"                                    # prefix used when the CSV makers write their output
delta_path = "drive/My Drive/Thesis/Data/Deltacode/"

# Names

In [None]:
### Load the annotated average ratings and build one dataframe per dimension
### (age, gender, polarity). Each dataframe keeps only the rows that have a
### mean rating for that dimension, and only the columns: name, mean rating,
### the dimension's own column (presumably the author's choice), and name type.

### These dataframes are the input for the FT model, which itself is the input
### for the ElasticNet and ANN regressions.

names_ratings = read_csv("drive/MyDrive/Thesis/Data/giovanni_email_data/avgRatings_annotated.csv")

# Report how many names have a (non-NA) mean rating for each dimension
for rating_column in ('rating.mean_age', 'rating.mean_gender', 'rating.mean_valence'):
  print(names_ratings[rating_column].notna().sum())

# Age: rows with an age rating, restricted to the relevant columns
has_age_rating = names_ratings['rating.mean_age'].notna()
df_age = names_ratings.loc[has_age_rating, ['name', 'rating.mean_age', 'age', 'name_type']]
print(df_age.head(), len(df_age))

# Gender: rows with a gender rating, restricted to the relevant columns
has_gender_rating = names_ratings['rating.mean_gender'].notna()
df_gender = names_ratings.loc[has_gender_rating, ['name', 'rating.mean_gender', 'gender', 'name_type']]
print(df_gender.head(), len(df_gender))

# Polarity: rows with a valence rating, restricted to the relevant columns
has_valence_rating = names_ratings['rating.mean_valence'].notna()
df_polarity = names_ratings.loc[has_valence_rating, ['name', 'rating.mean_valence', 'polarity', 'name_type']]
print(df_polarity.head(), len(df_polarity))

# Functions

In [None]:
def fnn_maker(x_train, y_train, x_test, y_test, nodes, dropout):
  """Train a feed-forward regression network with a single hidden layer.

  Input:
  - x_train = array of embeddings used to train the model (the hidden layer is
    declared with input_dim=300, so 300 features per row are expected)
  - y_train = array of ratings used to train the model
  - x_test = not used by this function; kept in the signature
  - y_test = not used by this function; kept in the signature
  - nodes = integer indicating the number of nodes to use in the hidden layer
  - dropout = value handed to the Dropout layer that follows the hidden layer

  Process:
  Seed Python's and TensorFlow's random generators, build
  Dense(LeakyReLU, HeNormal) -> Dropout -> Dense(1, linear), compile it with
  Adam and mean squared error, and fit it on the train set for at most 100
  epochs using one batch per epoch (batch_size = len(x_train)). Training stops
  early once the training loss has not improved for 3 epochs.

  Output:
  - fnn_model = trained neural network model (no error metric is returned)
  """
  seed_value = 17042020
  random.seed(seed_value)                                                       # Python's built-in generator
  set_seed(seed_value)                                                          # TensorFlow/Keras generator, just to be sure

  hidden_layer = Dense(nodes, input_dim=300, kernel_initializer=HeNormal(),
                       activation=keras.layers.LeakyReLU())
  output_layer = Dense(1, activation='linear')

  fnn_model = Sequential()
  for layer in (hidden_layer, Dropout(dropout), output_layer):                  # hidden layer -> dropout -> single linear output
    fnn_model.add(layer)

  fnn_model.compile(optimizer=Adam(), loss='mean_squared_error')

  stop_when_stalled = EarlyStopping(monitor = 'loss', patience=3, verbose=0)    # watches the training loss, not a validation loss

  fnn_model.fit(x_train, y_train, epochs=100, batch_size=len(x_train),
                callbacks=[stop_when_stalled], verbose=0)

  return fnn_model

In [None]:
def model_evaluator(trained_model, x_train, x_test, y_train, y_test, pred_dict, 
                    test_names, test_name_types, lexical = None):
  """Evaluate a trained model on a test split and record per-name predictions.

  Input:
  - trained_model = fitted model whose predict() accepts `verbose` (as the
    Keras models built by fnn_maker do)
  - x_train, y_train = not used here; kept in the signature
  - x_test = iterable of test embeddings; each row must support .reshape
  - y_test = true ratings for the test rows
  - pred_dict = dictionary that is filled in place: pred_dict[name] and
    pred_dict[name + '_mean_vector'] both get [name type, true rating, prediction]
  - test_names = names of the test rows (lower-cased before being used as keys)
  - test_name_types = name type of every test row
  - lexical = when exactly True, the module-level `mean_vector_subwordless` is
    used as the baseline vector; otherwise the module-level `mean_vector`

  Process:
  Every test row is predicted once; that prediction feeds both pred_dict and
  the per-name-type absolute errors. The per-type MAEs are kept in a local
  dictionary; nothing is written to module globals.

  Output:
  - mae_test = MAE over the whole test set
  - mae_madeup, mae_real, mae_talking = MAE per name type, or None when the
    test set contains no name of that type
  - mean_vec_mae_test = MAE obtained when every test row is replaced by the
    baseline (mean) vector
  """

  def predict_single(vector):
    # predict() on one row yields a one-element array; .item() extracts the
    # scalar without relying on implicit array -> float conversion.
    output = trained_model.predict(vector.reshape(1, -1), verbose=0)
    return float(np.asarray(output).item())

  baseline_vector = mean_vector_subwordless if lexical is True else mean_vector
  mean_vec_prediction = predict_single(baseline_vector)

  ##### Per-name predictions and absolute errors per name type #################

  abs_error_sum = {}                                                            # name type -> summed absolute error
  type_counter = {}                                                             # name type -> number of test names
  for name, name_type, rating, embedding in zip(test_names, test_name_types, y_test, x_test):
    name = name.lower()
    prediction = predict_single(embedding)

    pred_dict[name] = [name_type, rating, prediction]
    pred_dict[name + '_mean_vector'] = [name_type, rating, mean_vec_prediction]

    abs_error_sum[name_type] = abs_error_sum.get(name_type, 0.0) + abs(rating - prediction)
    type_counter[name_type] = type_counter.get(name_type, 0) + 1

  mae_per_type = {name_type: float(abs_error_sum[name_type]) / float(type_counter[name_type])
                  for name_type in abs_error_sum}

  ##### Overall MAE ############################################################

  y_pred = trained_model.predict(x_test, verbose=0)
  mae_test = mean_absolute_error(y_test, y_pred)

  ##### Mean Only ##############################################################

  baseline_row = baseline_vector.reshape(1, -1)
  mean_vec_array = np.full((len(x_test), baseline_row.shape[1]), baseline_row)  # One copy of the baseline vector per test row
  mean_vec_mae_test = mean_absolute_error(y_test, trained_model.predict(mean_vec_array, verbose=0))

  # .get() yields None for name types that do not occur in this test split
  return (mae_test, mae_per_type.get('madeup'), mae_per_type.get('real'),
          mae_per_type.get('talking'), mean_vec_mae_test)

In [None]:
# Header of the metrics .csv: identifiers, then the mean MAEs, then their SDs
column_list_metrics = [
  'dimension',
  'analysis_type',
  'mae_test',
  'mae_madeup',
  'mae_real',
  'mae_talking',
  'mean_vec_mae_test',
  'sd_mae_total',
  'sd_mae_madeup',
  'sd_mae_real',
  'sd_mae_talking',
  'sd_mean_vec_mae',
]

# Header of the per-name predictions .csv
column_list_pred = [
  'Name',
  'NameType',
  'TrueRating',
  'AnalysisType',
  'Prediction',
  'MeanVecPrediction',
]

In [None]:
def prediction_csv_maker(names_df, ngram, ngram_lex, lexical, file_name):
  """Write the per-name predictions of the three model types to a .csv file.

  Input:
  - names_df = dataframe with a 'name' column; names are sorted and lower-cased
  - ngram, ngram_lex, lexical = prediction dictionaries in which both name and
    name + '_mean_vector' map to a list whose third element is the prediction
  - file_name = file name that is appended to csv_path

  Process:
  Every name yields three rows ('Ngram', 'NgramLex', 'Lexical'). The name type
  and the true rating of all three rows are read from the `ngram` dictionary.
  """
  labelled_sources = (('Ngram', ngram), ('NgramLex', ngram_lex), ('Lexical', lexical))

  rows = []                                                                     # list of lists that becomes the dataframe
  for raw_name in sorted(names_df['name']):
    key = raw_name.lower()
    mean_key = key + '_mean_vector'
    name_type, true_rating = ngram[key][0], ngram[key][1]

    for label, source in labelled_sources:                                      # one row per model type
      rows.append([key, name_type, true_rating, label,
                   source[key][2], source[mean_key][2]])

  pd.DataFrame(rows, columns=column_list_pred).to_csv(csv_path + file_name, index=False)

In [None]:
def fnn_maker_and_evaluator(x_train, y_train, x_test, y_test, pred_dict, 
                            test_names, test_name_types, nodes, dropout, analysis_type, 
                            dictionary, lexical = None):
  """Train a network on one split, evaluate it, and log the resulting metrics.

  Input:
  - x_train = array of embeddings used to train the model
  - y_train = array of ratings used to train the model
  - x_test = array of embeddings used to test the model
  - y_test = array of ratings used to test the model
  - pred_dict = a dictionary that will be filled with predictions per name
  - test_names = the full names (i.e., not the embeddings) of the test rows
  - test_name_types = the name type (real, madeup, talking) of the test rows
  - nodes = integer indicating the number of nodes to use in the hidden layer
  - dropout = dropout value forwarded to fnn_maker()
  - analysis_type = key of `dictionary` under which the metrics are stored
  - dictionary = maps analysis type to a list; one list of five metrics is
    appended to dictionary[analysis_type] per call
  - lexical = forwarded unchanged to model_evaluator()

  Process:
  Train a sequential NN on the train set using fnn_maker(). Then evaluate the
  model using model_evaluator(), store the metrics, and return them.

  Output:
  - (mae_test, mae_madeup, mae_real, mae_talking, mean_vec_mae_test), exactly
    as returned by model_evaluator()
  """
  trained_network = fnn_maker(x_train, y_train, x_test, y_test, nodes, dropout)

  evaluation = model_evaluator(trained_network, x_train, x_test, y_train, y_test,
                               pred_dict, test_names, test_name_types, lexical)
  mae_test, mae_madeup, mae_real, mae_talking, mean_vec_mae_test = evaluation

  metrics_row = [mae_test, mae_madeup, mae_real, mae_talking, mean_vec_mae_test]
  dictionary[analysis_type].append(metrics_row)

  return mae_test, mae_madeup, mae_real, mae_talking, mean_vec_mae_test

In [None]:
def splitter(df, rating, train_index, test_index):
  """Split `df` into train/test parts and embed both parts in three ways.

  Input:
  - df = dataframe with at least the columns 'name', 'name_type', and `rating`
  - rating = name of the column holding the ratings (the y-values)
  - train_index, test_index = positional row indices (used with .iloc)

  Output (in this order):
  - x_train_ngram, x_test_ngram = fasttext_xifyer_ngram_v2() output
  - x_train_ngram_lex, x_test_ngram_lex = fasttext_xifyer_ngram_v2(lexical = True) output
  - x_train_lexical, x_test_lexical = fasttext_xifyer_lexical_v2() output
  - y_train, y_test = ratings of the train / test rows
  - test_names, test_name_types = 'name' and 'name_type' of the test rows
  """
  train_rows = df.iloc[train_index]
  test_rows = df.iloc[test_index]

  # Embed the train rows first, then the test rows (ngram, ngram + lexical, lexical)
  x_train_ngram = fasttext_xifyer_ngram_v2(train_rows)
  x_train_ngram_lex = fasttext_xifyer_ngram_v2(train_rows, lexical = True)
  x_train_lexical = fasttext_xifyer_lexical_v2(train_rows)

  x_test_ngram = fasttext_xifyer_ngram_v2(test_rows)
  x_test_ngram_lex = fasttext_xifyer_ngram_v2(test_rows, lexical = True)
  x_test_lexical = fasttext_xifyer_lexical_v2(test_rows)

  # Targets and test-set metadata are read from fresh slices of df
  fresh_train = df.iloc[train_index]
  y_train = fresh_train[rating]
  y_test = df.iloc[test_index][rating]

  test_names = df.iloc[test_index]['name']
  test_name_types = df.iloc[test_index]['name_type']

  return (x_train_ngram, x_test_ngram, x_train_ngram_lex, x_test_ngram_lex,
          x_train_lexical, x_test_lexical, y_train, y_test, test_names,
          test_name_types)

In [None]:
def _sd_of_present_values(values):
  """Return the population SD (np.std) of the entries of `values` that are not None; NaN if none remain."""
  present = [value for value in values if value is not None]
  if not present:
    return np.nan                                                               # same value np.std([]) yields, without the warning
  return np.std(present)


def sd_calculator(dictionary, type_list):
  """Compute the standard deviation of every metric over all iterations.

  Input:
  - dictionary = maps analysis type to a list of iterations; every iteration is
    [mae_test, mae_madeup, mae_real, mae_talking, mean_vec_mae_test], where
    entries may be None
  - type_list = the analysis types to process ('ngram', 'ngram_lex', 'lexical')

  Process:
  None entries are skipped. A MAE of exactly 0.0 is a valid value and is kept
  (filtering on truthiness would silently drop it).

  Output:
  - sd_dict_nn = maps analysis type to {'total', 'madeup', 'real', 'talking',
    'mean_vec_mae'}. 'madeup' is only computed for 'ngram' and is None for the
    other processed types. Types that are not in type_list keep empty lists.
  """
  metric_names = ('total', 'madeup', 'real', 'talking', 'mean_vec_mae')         # same order as the entries of an iteration

  sd_dict_nn = {analysis_type: {metric: [] for metric in metric_names}
                for analysis_type in ('ngram', 'ngram_lex', 'lexical')}

  for analysis_type in type_list:
    iterations = dictionary[analysis_type]

    for position, metric in enumerate(metric_names):
      metric_values = [iteration[position] for iteration in iterations]
      sd_dict_nn[analysis_type][metric] = _sd_of_present_values(metric_values)

    if analysis_type != 'ngram':                                                # the madeup SD is only reported for the ngram model
      sd_dict_nn[analysis_type]['madeup'] = None

  return sd_dict_nn

In [None]:
def _mean_of_present_values(values):
  """Return np.mean of the entries of `values` that are not None; NaN if none remain."""
  present = [value for value in values if value is not None]
  if not present:
    return np.nan                                                               # same value np.mean([]) yields, without the warning
  return np.mean(present)


def list_creator(dictionary):
  """Average every metric over all iterations, per analysis type.

  Input:
  - dictionary = has the keys 'ngram', 'ngram_lex', and 'lexical'; each maps to
    a list of iterations, where every iteration is a list of metrics whose
    entries may be None

  Process:
  The iterations are transposed so that each metric is averaged across
  iterations. None entries are skipped; a MAE of exactly 0.0 is a valid value
  and is included in the mean.

  Output:
  - ngram_list, ngram_lex_list, lexical_list = one list of per-metric means for
    every analysis type (an empty list when there are no iterations)
  """
  def metric_means(iterations):
    return [_mean_of_present_values(metric_values) for metric_values in zip(*iterations)]

  ngram_list = metric_means(dictionary['ngram'])
  ngram_lex_list = metric_means(dictionary['ngram_lex'])
  lexical_list = metric_means(dictionary['lexical'])

  return ngram_list, ngram_lex_list, lexical_list

In [None]:
def metrics_csv_maker(sd_dict_nn, ngram_list, ngram_lex_list, 
                      lexical_list, type_list, dimension):
  """Write the mean metrics and their standard deviations to a .csv file.

  Input:
  - sd_dict_nn = maps analysis type to a dictionary with the keys 'total',
    'madeup', 'real', 'talking', and 'mean_vec_mae'
  - ngram_list, ngram_lex_list, lexical_list = mean metrics per analysis type;
    they are paired positionally with the entries of type_list
  - type_list = names of the analysis types
  - dimension = name of the dimension (i.e., 'age', 'gender', 'polarity'); also
    used as the prefix of the output file name

  Process:
  Build one row per analysis type: dimension, analysis type, the mean metrics,
  and the five SDs, and save the rows as csv_path + dimension + '_nn_metrics.csv'.
  Each row is assembled as a new list, so the metric lists passed in are left
  untouched and the function can safely be called again with the same lists.
  """
  sd_keys = ('total', 'madeup', 'real', 'talking', 'mean_vec_mae')              # order of the sd_* columns in the output

  csv_df = []                                                                   # Create a list of lists that will be converted to a dataframe

  for value_list, analysis_type in zip([ngram_list, ngram_lex_list,
                                        lexical_list], type_list):
    sd_values = [sd_dict_nn[analysis_type][key] for key in sd_keys]
    csv_df.append([dimension, analysis_type, *value_list, *sd_values])          # identifiers, then the means, then the SDs

  csv_df = pd.DataFrame(csv_df, columns=column_list_metrics)                    # Convert the list of lists to a DF
  csv_df.to_csv(csv_path + dimension +'_nn_metrics.csv', index=False)           # Save the DF as a .csv file

In [None]:
def neural_network_k_folder(df, rating, dimension, dictionary, nodes, dropout):
  """Run leave-one-out cross-validation for the three model types.

  Input:
  - df = a dataframe with the name, name_type, and rating for the dimension at
    hand (i.e., age, gender, or polarity)
  - rating = a string indicating what rating column to extract from the df
  - dimension = a string indicating what dimension is considered (i.e., 'age',
    'gender', or 'polarity'); used for the metrics file
  - dictionary = has the keys 'ngram', 'ngram_lex', and 'lexical', each mapping
    to a list that receives one list of metrics per fold
  - nodes = integer indicating the number of nodes to use in the hidden layer
  - dropout = dropout value forwarded to the network

  Process:
  LeaveOneOut() yields one split per row of df. For every split, splitter()
  provides the embeddings, and fnn_maker_and_evaluator() trains and evaluates
  one network each for the ngram, ngram_lex, and lexical data. Afterwards the
  SDs and means of the metrics are computed and written to a .csv file by
  metrics_csv_maker().

  Output:
  - pred_dict_ngram: y_true and y_pred per name (ngram)
  - pred_dict_ngram_lex: y_true and y_pred per name (ngram_lex)
  - pred_dict_lexical: y_true and y_pred per name (lexical)
  """
  type_list = ['ngram', 'ngram_lex', 'lexical']                                 # List indicating the model type

  pred_dict_ngram, pred_dict_ngram_lex, pred_dict_lexical = {}, {}, {}          # Filled with the predictions for the held-out names

  loocv = LeaveOneOut()                                                         # One fold per row; every fold holds out a single row

  for train_index, test_index in loocv.split(df):
    (x_train_ngram, x_test_ngram, x_train_ngram_lex, x_test_ngram_lex,
     x_train_lexical, x_test_lexical, y_train, y_test,
     test_names, test_name_types) = splitter(df, rating, train_index, test_index)

    # (analysis type, train embeddings, test embeddings, prediction dict, extra keyword arguments)
    configurations = [
      ('ngram', x_train_ngram, x_test_ngram, pred_dict_ngram, {}),
      ('ngram_lex', x_train_ngram_lex, x_test_ngram_lex, pred_dict_ngram_lex, {}),
      ('lexical', x_train_lexical, x_test_lexical, pred_dict_lexical, {'lexical': True}),
    ]

    for analysis_type, x_train, x_test, pred_dict, extra_kwargs in configurations:
      fnn_maker_and_evaluator(x_train, y_train, x_test, y_test, pred_dict,
                              test_names, test_name_types, nodes, dropout,
                              analysis_type, dictionary, **extra_kwargs)          # Train and evaluate the NN for this fold

  sd_dict_nn = sd_calculator(dictionary, type_list)                             # SD of every metric over the folds

  ngram_list, ngram_lex_list, lexical_list = list_creator(dictionary)           # Mean of every metric over the folds

  metrics_csv_maker(sd_dict_nn, ngram_list, ngram_lex_list, lexical_list,
                    type_list, dimension)                                       # Save the metrics as a .csv file

  return pred_dict_ngram, pred_dict_ngram_lex, pred_dict_lexical

## Running the Functions

In [None]:
# Score dictionary for age: one (initially empty) list of metrics per analysis type
age_dict_nn_final = {analysis_type: [] for analysis_type in ('ngram', 'ngram_lex', 'lexical')}

# Run the leave-one-out cross validation (256 hidden nodes, dropout 0.5); the
# metrics are saved as a .csv file by neural_network_k_folder itself
(age_pred_dict_ngram_nn,
 age_pred_dict_ngram_lex_nn,
 age_pred_dict_lexical_nn) = neural_network_k_folder(
    df_age, 'rating.mean_age', 'age', age_dict_nn_final, nodes=256, dropout=0.5)

# Save predictions per name and model type to a csv
prediction_csv_maker(df_age,
                     age_pred_dict_ngram_nn,
                     age_pred_dict_ngram_lex_nn,
                     age_pred_dict_lexical_nn,
                     file_name='age_pred_nn.csv')

In [None]:
# Score dictionary for gender: one (initially empty) list of metrics per analysis type
gender_dict_nn_final = {analysis_type: [] for analysis_type in ('ngram', 'ngram_lex', 'lexical')}

# Run the leave-one-out cross validation (512 hidden nodes, dropout 0.2); the
# metrics are saved as a .csv file by neural_network_k_folder itself
(gender_pred_dict_ngram_nn,
 gender_pred_dict_ngram_lex_nn,
 gender_pred_dict_lexical_nn) = neural_network_k_folder(
    df_gender, 'rating.mean_gender', 'gender', gender_dict_nn_final, nodes=512, dropout=0.2)

# Save predictions per name and model type to a csv
prediction_csv_maker(df_gender,
                     gender_pred_dict_ngram_nn,
                     gender_pred_dict_ngram_lex_nn,
                     gender_pred_dict_lexical_nn,
                     file_name='gender_pred_nn.csv')

In [None]:
# Score dictionary for polarity: one (initially empty) list of metrics per analysis type
polarity_dict_nn_final = {analysis_type: [] for analysis_type in ('ngram', 'ngram_lex', 'lexical')}

# Run the leave-one-out cross validation (512 hidden nodes, dropout 0.5); the
# metrics are saved as a .csv file by neural_network_k_folder itself
(polarity_pred_dict_ngram_nn,
 polarity_pred_dict_ngram_lex_nn,
 polarity_pred_dict_lexical_nn) = neural_network_k_folder(
    df_polarity, 'rating.mean_valence', 'polarity', polarity_dict_nn_final, nodes=512, dropout=0.5)

# Save predictions per name and model type to a csv
prediction_csv_maker(df_polarity,
                     polarity_pred_dict_ngram_nn,
                     polarity_pred_dict_ngram_lex_nn,
                     polarity_pred_dict_lexical_nn,
                     file_name='polarity_pred_nn.csv')