Loads the two main files (the raw tweets and the filtered users). I dropped the columns I felt would not be used throughout the pipeline. Users are also filtered based on whether they have a Twitter account and whether their Nationality mentions Filipino anywhere. The *3rd* file loaded is the tweets with the auto-generated tweets removed.

In [1]:
from ftfy import fix_encoding
import pandas as pd
import re
import csv
# from tweetokenize import Tokenizer

# The five personality-trait column names; the stats and modelling loops
# iterate over them in this order.
traits = [
    "Openness",
    "Extraversion",
    "Conscientiousness",
    "Agreeableness",
    "Neuroticism",
]

def fix_encode(x):
    """Return ``x`` after passing it through ftfy's ``fix_encoding``.

    Thin wrapper used when cleaning the tweet ``Text`` columns.
    """
    repaired = fix_encoding(x)
    return repaired

# Never truncate rows or long text when a DataFrame is displayed.
for _display_option in ('display.max_rows', 'display.max_colwidth'):
    pd.set_option(_display_option, None)

# All three inputs are read with the same CSV options.
_csv_read_options = dict(encoding="utf-8", sep=",", quoting=csv.QUOTE_ALL)

# Raw tweets.
df_twt_tweets = pd.read_csv("twitter_tweet.csv", **_csv_read_options)

# Users with their demographic and trait columns.
df_filtered_users = pd.read_csv("filtered_users.csv", **_csv_read_options)

# Tweets with the auto-generated ones removed (see the notebook intro).
df_processed_tweets = pd.read_csv(
    "processed_tweets_dataset_nonconcat.csv", **_csv_read_options)

# --- DROP --- HasInstagram, and Est. Date of Participation from filtered users
df_filtered_users = df_filtered_users.drop(
    columns=['HasInstagram', 'Est. Date of Participation'])

# --- DROP --- Date_Created and Is_Truncated from tweets
df_twt_tweets = df_twt_tweets.drop(columns=['Date_Created', 'Is_Truncated'])

# --- DROP --- rows wherein => HasTwitter is FALSE
# (`!= False` rather than truthiness so that only explicit False rows are removed.)
df_filtered_users = df_filtered_users[df_filtered_users['HasTwitter'] != False]

# --- DROP --- rows wherein => Nationality is NOT Filipino
# NOTE: just a precaution. `na=False` treats a missing Nationality as "not
# Filipino"; without it str.contains yields NaN for those rows and the `~`
# negation raises a TypeError.
_is_filipino = df_filtered_users['Nationality'].str.contains('Filipino', na=False)
df_filtered_users = df_filtered_users[_is_filipino]

Some descriptive statistics about both users and tweets that I thought would be useful. The stats pre- and post-removal of auto-generated tweets are not included.

In [None]:
# Separator printed between the sections of the report.
_DIVIDER = "--------------------"

# --- [USERS] ---
# --- PRINT --- COUNT
print("Total number of user: " + str(df_filtered_users['HasTwitter'].count()))
print(_DIVIDER)

# --- PRINT --- Nationality
# comment: "Mixed-Filipino" means Filipino + another nationality. These two
# figures were counted manually and are hard-coded for documentation only;
# they are NOT recomputed from df_filtered_users.
print("Nationality: ")
print("Filipino: 2263")
print("Mixed-Filipino: 20")
print(_DIVIDER)

# --- PRINT --- Sex
print("Sex: ")
print(df_filtered_users['Sex'].value_counts(ascending=True))
print(_DIVIDER)

# --- PRINT --- Age
print("Age: ")
print(df_filtered_users['Age'].describe())
print(_DIVIDER)

# --- PRINT --- Age
# comment: didn't see the need to make intervals for age (maybe); left it just in case
# print("Age: ")
# print(df_filtered_users['Age'].value_counts(bins=4))
# print("--------------------")

# --- PRINT --- 5 TRAITS
for trait in traits:
    trait_scores = df_filtered_users[trait]
    print(trait)
    print("Mean: " + str(trait_scores.mean()))
    print("SD: " + str(trait_scores.std()))
    print("Min: " + str(trait_scores.min()))
    print("Max: " + str(trait_scores.max()))
    print(_DIVIDER)

# --- [TWEETS] ---
# --- PRINT --- tweets per language
# The header used to read "Total number of user: ", but what follows is the
# per-language tweet count, so the label now says so.
print("Tweets per language: ")
print(df_twt_tweets['Language'].value_counts(ascending=True))
print(_DIVIDER)
# ===== END DESCRIPTIVE STATS =====

This section drops any users in the twitter_tweet file who do not pass the 100-tweet threshold. It exports the final DataFrame to a "concatex.csv" file. This file (and the DataFrame) serves as the main dataset for the tweets.
The CSV file should contain only the UserID and the Concatext columns.

In [3]:
# Work on a copy so the original tweets frame is untouched; TweetID is not needed.
df_temp_tweets = df_twt_tweets[['UserID', 'Text']].copy()

# Fill blanks and fix encoding. Assign the result back instead of calling
# fillna(inplace=True) on the column: the chained in-place form does not
# update the frame under pandas copy-on-write.
df_temp_tweets['Text'] = df_temp_tweets['Text'].fillna(" ").apply(fix_encode)

# Tweet count per user = number of rows carrying that UserID.
tweet_count = (
    df_temp_tweets['UserID']
    .value_counts()
    .rename('TweetCount')
    .rename_axis('UserID')
    .reset_index()
)

# Remove users with <100 tweets.
tweet_count = tweet_count[tweet_count['TweetCount'] >= 100].reset_index(drop=True)

# Users who passed the threshold; sort the tweets by user, then keep only those users.
users = tweet_count['UserID'].copy()
df_temp_tweets = df_temp_tweets.sort_values(by=['UserID'])
df_temp_tweets = df_temp_tweets[df_temp_tweets['UserID'].isin(users)]

# The MAIN dataset for tweets: one row per user, with all of that user's
# tweets joined by single spaces into the Concatext column.
df_dataset_tweets = (
    df_temp_tweets
    .groupby('UserID')['Text']
    .agg(" ".join)
    .rename('Concatext')
    .reset_index()
)

# index=False so the file holds only the UserID and Concatext columns.
df_dataset_tweets.to_csv(
    r'concatex.csv',
    encoding="utf-8",
    sep=",",
    quoting=csv.QUOTE_ALL,
    index=False)


Drop any users in ***df_dataset_tweets*** that are not found in ***filtered_users***.
The other direction (users in ***filtered_users*** without a row in ***df_dataset_tweets***) is dropped in the next step.

In [4]:
# Keep only the concatenated-tweet rows whose UserID appears in the filtered users.
filtered_users = df_filtered_users['UserID'].copy()
_has_user_record = df_dataset_tweets['UserID'].isin(filtered_users)
df_dataset_tweets = df_dataset_tweets[_has_user_record]

# Quick size check of both frames.
for _frame in (df_filtered_users, df_dataset_tweets):
    print(_frame.shape)

(2283, 10)
(2112, 2)


**FOR TESTING PURPOSES ONLY**

In [None]:
#NOTE: printing the head and tail of both UserID and Concatext: everything seems okay? (hopefully)
# print(df_dataset_tweets['UserID'].head(15))
# print(df_dataset_tweets['UserID'].tail(15))
# print(df_dataset_tweets['Concatext'].head(1))
# print(df_dataset_tweets['Concatext'].tail(1))

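Makes a DataFrame for each personality trait, containing only the UserID, the respective trait, and the concatenated text. Performs an **INNER JOIN** so that only rows found in both df_filtered_users and df_dataset_tweets are copied into the new DataFrames. The key used is *UserID*. **Note: the per-trait DataFrames currently serve no purpose downstream (wala na silbi for now); only `df_personality_master` is used later.**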

In [5]:
# Inner join on UserID: one row per user present in both frames.
df_personality_master = df_filtered_users.merge(
    df_dataset_tweets, how='inner', on='UserID')

# Per-trait views: UserID, the trait score, and the concatenated text.
df_personality_Openness = df_personality_master.loc[
    :, ['UserID', 'Openness', 'Concatext']].copy()
df_personality_Conscientiousness = df_personality_master.loc[
    :, ['UserID', 'Conscientiousness', 'Concatext']].copy()
df_personality_Extraversion = df_personality_master.loc[
    :, ['UserID', 'Extraversion', 'Concatext']].copy()
df_personality_Agreeableness = df_personality_master.loc[
    :, ['UserID', 'Agreeableness', 'Concatext']].copy()
df_personality_Neuroticism = df_personality_master.loc[
    :, ['UserID', 'Neuroticism', 'Concatext']].copy()

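Model Building and Evaluation. The algorithms used are Linear Regression (LIN), Ridge Regression (RIN), and Support Vector Regression (SVR, with an RBF kernel), compared against a mean-predicting dummy regressor as the baseline.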

In [None]:
import numpy as np
from numpy import mean
from numpy import arange
from numpy import std
from numpy import absolute
from sklearn.feature_extraction.text import TfidfVectorizer
# from tweetokenize import Tokenizer
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_validate
from sklearn.metrics import make_scorer
from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.svm import SVR
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import MaxAbsScaler
from sklearn.dummy import DummyRegressor
import pandas as pd 

# TOKENIZER = Tokenizer(
# 	lowercase = False,
# 	allcapskeep = True,
# 	normalize = False,
# 	usernames = '__USERNAME__',
# 	urls = '__URL__',
# 	hashtags = '__HASHTAG__',
# 	numbers = "__NUMBER__",
# 	ignorequotes = False,
# 	ignorestopwords = False)

# Mean-predicting baseline first, then the three regressors being compared.
clfs = [DummyRegressor(strategy='mean'), LinearRegression(), Ridge(), SVR()]

# Result records (one dict each) collected by the modelling loops.
results = []

# Hyper-parameter grids searched for Ridge and for SVR.
rr_param_grid = [
    {'alpha': [0.0001, 0.001, 0.01, 0.1, 1, 10, 100, 1000, 10000]},
]
svr_param_grid = [
    {
        'kernel': ['rbf'],
        'C': [0.0001, 0.001, 0.01, 0.1, 1, 10, 100, 1000, 10000],
    },
]

# Shuffled 10-fold splitter with a fixed seed, shared by every CV routine.
cross_val_k_folds = KFold(n_splits=10, shuffle=True, random_state=10)

eval_scoring = dict(mse='neg_mean_squared_error', rsq='r2')

# root mean squared error
def eval_compute_rmse(test, pred):
    """Return the root mean squared error between ``test`` and ``pred``."""
    mse = mean_squared_error(test, pred)
    return np.sqrt(mse)

# coefficient of determination (R squared)
def eval_compute_rsq(test, pred):
    """Return the R² score of predictions ``pred`` against true values ``test``."""
    rsq = r2_score(test, pred)
    return rsq

def cross_validate(trait, clf, x_train, y_train):
    """Run k-fold cross-validation of ``clf`` using ``cross_val_k_folds``.

    NOTE: this definition shadows ``sklearn.model_selection.cross_validate``,
    which is imported earlier in the notebook.

    Parameters:
        trait: label stored under ``'trait'`` in the returned record.
        clf: estimator exposing ``fit``/``predict``; it is re-fitted in place
            on every fold.
        x_train: feature matrix, indexed positionally with the fold indices.
        y_train: target array, indexed positionally with the fold indices.

    Returns:
        dict with ``type``, ``trait``, ``clf`` (estimator class name),
        ``rmse`` / ``rmse_std`` (mean and standard deviation of the per-fold
        RMSE) and ``r2`` (mean per-fold R²). The R² was previously computed
        for every fold but never returned.
    """
    fold_rmse = []
    fold_rsq = []
    for train_index, test_index in cross_val_k_folds.split(x_train):
        cv_x_train, cv_x_test = x_train[train_index], x_train[test_index]
        cv_y_train, cv_y_test = y_train[train_index], y_train[test_index]

        clf.fit(cv_x_train, cv_y_train)
        cv_y_pred = clf.predict(cv_x_test)

        fold_rmse.append(eval_compute_rmse(cv_y_test, cv_y_pred))
        fold_rsq.append(eval_compute_rsq(cv_y_test, cv_y_pred))

    fold_rmse = np.asarray(fold_rmse)
    fold_rsq = np.asarray(fold_rsq)

    results = {
        'type': "cross_validation",
        'trait': trait,
        'clf': type(clf).__name__,
        'rmse': fold_rmse.mean(),
        'rmse_std': fold_rmse.std(),
        'r2': fold_rsq.mean()
    }

    return results

def cross_validate_rr(trait, clf, x_train, y_train, params_grid):
    """Grid-search ``clf`` over ``params_grid`` and report the best CV RMSE.

    Parameters:
        trait: label stored under ``'trait'`` in the returned record.
        clf: estimator to tune (the Ridge regressor in this notebook).
        x_train, y_train: training features and targets.
        params_grid: parameter grid handed to ``GridSearchCV``.

    Returns:
        ``(results, best_params)`` where ``results`` holds the mean and
        standard deviation of the RMSE across folds for the best parameter
        setting, and ``best_params`` is ``GridSearchCV.best_params_``.
    """
    # greater_is_better=False makes the scorer return the NEGATED RMSE so that
    # GridSearchCV can maximise it.
    scorer = make_scorer(eval_compute_rmse, greater_is_better=False)
    rid_gs = GridSearchCV(clf, params_grid, cv=cross_val_k_folds, scoring=scorer)
    rid_gs.fit(x_train, y_train)

    best = rid_gs.best_index_
    results = {
        'type': 'cross_validation',
        'trait': trait,
        'clf': type(clf).__name__,
        # Flip the sign back so the reported RMSE is positive, like the other rows.
        'rmse': -rid_gs.cv_results_['mean_test_score'][best],
        'rmse_std': rid_gs.cv_results_['std_test_score'][best]
    }
    return results, rid_gs.best_params_

def cross_validate_svr(trait, clf, x_train, y_train, params_grid):
    """Grid-search ``clf`` over ``params_grid`` (verbose) and report the best CV RMSE.

    Parameters:
        trait: label stored under ``'trait'`` in the returned record.
        clf: estimator to tune (the SVR regressor in this notebook).
        x_train, y_train: training features and targets.
        params_grid: parameter grid handed to ``GridSearchCV``.

    Returns:
        ``(results, best_params)`` where ``results`` holds the mean and
        standard deviation of the RMSE across folds for the best parameter
        setting, and ``best_params`` is ``GridSearchCV.best_params_``.
    """
    # greater_is_better=False makes the scorer return the NEGATED RMSE so that
    # GridSearchCV can maximise it.
    scorer = make_scorer(eval_compute_rmse, greater_is_better=False)
    svr_gs = GridSearchCV(
        clf, params_grid, cv=cross_val_k_folds, scoring=scorer, verbose=2)
    svr_gs.fit(x_train, y_train)

    best = svr_gs.best_index_
    results = {
        'type': 'cross_validation',
        'trait': trait,
        'clf': type(clf).__name__,
        # Flip the sign back so the reported RMSE is positive, like the other rows.
        'rmse': -svr_gs.cv_results_['mean_test_score'][best],
        'rmse_std': svr_gs.cv_results_['std_test_score'][best]
    }
    return results, svr_gs.best_params_

df_model = pd.DataFrame()
labels = df_personality_master[['Openness', 'Conscientiousness', 'Extraversion', 'Agreeableness', 'Neuroticism']]

# Take the documents from the SAME merged frame as the labels, so that row i of
# the text and row i of the trait scores always belong to the same UserID.
# (df_dataset_tweets is not guaranteed to share the merged frame's row order.)
documents = df_personality_master['Concatext']

for trait in traits:
    # train test split (60/40, fixed seed)
    x_train, x_test, y_train, y_test = train_test_split(
        documents,
        labels[trait],
        test_size = 0.4,
        random_state = 101)

    TFIDF_vectorizer = TfidfVectorizer(
        min_df = 0.05,
        max_features = 3000
    )

    # Separate scalers for features and target so neither overwrites the other's fit.
    SCALER = MaxAbsScaler()
    TARGET_SCALER = MaxAbsScaler()

    # Vocabulary, IDF weights and scaling are learned on the training split
    # only and then re-applied to the test split. Re-fitting on the test split
    # would produce a different feature space than the one the model was
    # trained on.
    x_train = SCALER.fit_transform(TFIDF_vectorizer.fit_transform(x_train))
    x_test = SCALER.transform(TFIDF_vectorizer.transform(x_test))

    # Scale the target with the training-split maximum; ravel() hands the
    # estimators a 1-D target instead of a column vector.
    y_train = TARGET_SCALER.fit_transform(y_train.to_numpy().reshape(-1, 1)).ravel()
    y_test = TARGET_SCALER.transform(y_test.to_numpy().reshape(-1, 1)).ravel()

    for clf in clfs:
        clf_name = type(clf).__name__

        # Ridge and SVR are tuned by grid search; the others get plain k-fold CV.
        if clf_name == "Ridge":
            cv_result, cv_best_params = cross_validate_rr(trait, clf, x_train, y_train, rr_param_grid)
            clf.set_params(**cv_best_params)
            print(cv_best_params)
        elif clf_name == "SVR":
            cv_result, cv_best_params = cross_validate_svr(trait, clf, x_train, y_train, svr_param_grid)
            clf.set_params(**cv_best_params)
            print(cv_best_params)
        else:
            cv_result = cross_validate(trait, clf, x_train, y_train)

        results.append(cv_result)

        print("-------------- TESTING --------------")
        # actual testing: refit on the whole training split, score on the held-out split
        model = clf.fit(x_train, y_train)
        y_pred = model.predict(x_test)

        results.append({
            'type': "testing",
            'trait': trait,
            'clf': clf_name,
            'rmse': eval_compute_rmse(y_test, y_pred),
            'r2': eval_compute_rsq(y_test, y_pred)
        })

        # Rewrite the results file after every classifier so partial results
        # survive an interrupted run.
        df_model = pd.DataFrame(results).set_index("trait")
        df_model.to_csv(
            "model_results_tfidf.csv",
            quoting=csv.QUOTE_ALL,
            sep=',',
            encoding='utf-8'
        )

In [7]:
# print(df_processed_tweets.shape)
# TweetID is not needed; errors="ignore" keeps this cell re-runnable after the
# column has already been removed.
df_processed_tweets = df_processed_tweets.drop(columns="TweetID", errors="ignore")

# Fill blanks and fix encoding. Assign the result back instead of calling
# fillna(inplace=True) on the column: the chained in-place form does not
# update the frame under pandas copy-on-write.
df_processed_tweets['Text'] = df_processed_tweets['Text'].fillna(" ").apply(fix_encode)

# Tweet count per user = number of rows carrying that UserID.
processed_tweet_count = (
    df_processed_tweets['UserID']
    .value_counts()
    .rename('TweetCount')
    .rename_axis('UserID')
    .reset_index()
)

# Remove users with <100 tweets.
processed_tweet_count = (
    processed_tweet_count[processed_tweet_count['TweetCount'] >= 100]
    .reset_index(drop=True)
)

# Users who passed the threshold, taken from the PROCESSED counts. This used
# to read the raw-tweet `tweet_count`, so the threshold computed above was
# never applied.
processedtwt_users = processed_tweet_count['UserID'].copy()
df_processed_tweets = df_processed_tweets.sort_values(by=['UserID'])
df_processed_tweets = df_processed_tweets[df_processed_tweets['UserID'].isin(processedtwt_users)]

# The MAIN dataset for processed tweets: one row per user, with all of that
# user's tweets joined by single spaces into the Concatext column.
df_dataset_processed_tweets = (
    df_processed_tweets
    .groupby('UserID')['Text']
    .agg(" ".join)
    .rename('Concatext')
    .reset_index()
)

# index=False so the file holds only the UserID and Concatext columns.
df_dataset_processed_tweets.to_csv(
    r'concatex_processed.csv',
    encoding="utf-8",
    sep=",",
    quoting=csv.QUOTE_ALL,
    index=False)

In [None]:
results.clear()

df_model_processedtwt = pd.DataFrame()

# Join the processed-tweet documents with the users' trait scores on UserID so
# that each document is paired with the scores of the same user.
# df_dataset_processed_tweets is never restricted to the filtered users, so
# pairing it positionally with a separately built label frame mismatches rows
# (or fails outright when the lengths differ).
df_processed_master = pd.merge(
    df_filtered_users[['UserID'] + traits],
    df_dataset_processed_tweets,
    on='UserID',
    how='inner')

for trait in traits:
    # train test split (60/40, fixed seed)
    x_train, x_test, y_train, y_test = train_test_split(
        df_processed_master['Concatext'],
        df_processed_master[trait],
        test_size = 0.4,
        random_state = 101)

    TFIDF_vectorizer = TfidfVectorizer(
        min_df = 0.05,
        max_features = 3000
    )

    # Separate scalers for features and target so neither overwrites the other's fit.
    SCALER = MaxAbsScaler()
    TARGET_SCALER = MaxAbsScaler()

    # Vocabulary, IDF weights and scaling are learned on the training split
    # only and then re-applied to the test split. Re-fitting on the test split
    # would produce a different feature space than the one the model was
    # trained on.
    x_train = SCALER.fit_transform(TFIDF_vectorizer.fit_transform(x_train))
    x_test = SCALER.transform(TFIDF_vectorizer.transform(x_test))

    # Scale the target with the training-split maximum; ravel() hands the
    # estimators a 1-D target instead of a column vector.
    y_train = TARGET_SCALER.fit_transform(y_train.to_numpy().reshape(-1, 1)).ravel()
    y_test = TARGET_SCALER.transform(y_test.to_numpy().reshape(-1, 1)).ravel()

    for clf in clfs:
        clf_name = type(clf).__name__

        # Ridge and SVR are tuned by grid search; the others get plain k-fold CV.
        if clf_name == "Ridge":
            cv_result, cv_best_params = cross_validate_rr(trait, clf, x_train, y_train, rr_param_grid)
            clf.set_params(**cv_best_params)
            print(cv_best_params)
        elif clf_name == "SVR":
            cv_result, cv_best_params = cross_validate_svr(trait, clf, x_train, y_train, svr_param_grid)
            clf.set_params(**cv_best_params)
            print(cv_best_params)
        else:
            cv_result = cross_validate(trait, clf, x_train, y_train)

        results.append(cv_result)

        print("-------------- TESTING --------------")
        # actual testing: refit on the whole training split, score on the held-out split
        model_processedtwt = clf.fit(x_train, y_train)
        y_pred = model_processedtwt.predict(x_test)

        results.append({
            'type': "testing",
            'trait': trait,
            'clf': clf_name,
            'rmse': eval_compute_rmse(y_test, y_pred),
            'r2': eval_compute_rsq(y_test, y_pred)
        })

        # Rewrite the results file after every classifier so partial results
        # survive an interrupted run.
        df_model_processedtwt = pd.DataFrame(results).set_index("trait")
        df_model_processedtwt.to_csv(
            "model_results_tfidf_processed_nopiv.csv",
            quoting=csv.QUOTE_ALL,
            sep=',',
            encoding='utf-8'
        )

In [None]:
# Reload the results file written by the raw-tweet modelling loop and show it.
_results_csv_options = dict(encoding="utf-8", sep=",", quoting=csv.QUOTE_ALL)
print_results = pd.read_csv("model_results_tfidf.csv", **_results_csv_options)

print(print_results)