# Final Project

Team name: Stack Overflow

Members:
Ki Jeong
Moneeb Abu-Esba
Omar Diaz
Luke Lopez

# Project Overview

Our project dataset(Gold_Annotations.xlsx) had 3 columns: Comment, Tech, and Subtopic.

Comment: This contained the user's reddit post
Tech: This contained either "tech" or "nontech" depending on the post
Subtopic: This contained "education", "mobility", "recreation", or "other" depending on the post


# Tech Variable

First, we will address the Tech variable. We are using cross-validation for simplicity.



In [94]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.svm import LinearSVC
from sklearn.metrics import precision_score, recall_score, f1_score, classification_report
from sklearn.model_selection import GridSearchCV

data = pd.read_excel('Gold_Annotations.xlsx')
X = data['Comment']
y = data['Tech']


X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.3, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)

class LexiconClassifier():
    def __init__(self):
        self.tech_words = ["computer", "phone", "app", "internet"]
        
    def tech_predict(self, sentence):
        num_tech_words = 0
        for word in sentence.lower().split():
            if word in self.tech_words:
                num_tech_words += 1
        
        tech_pred = 'nontech'        
        if num_tech_words > 0:
            tech_pred = 'tech'
        return tech_pred

lex_class = LexiconClassifier()
tech_preds = []

tmp = ''
for feature in X_train:
    tmp = lex_class.tech_predict(feature)
    tech_preds.append(tmp)


precision_macro = precision_score(y_train, tech_preds, average='macro')
recall_macro = recall_score(y_train, tech_preds, average='macro')
f1_macro = f1_score(y_train, tech_preds, average='macro')
precision_micro = precision_score(y_train, tech_preds, average='micro')
recall_micro = recall_score(y_train, tech_preds, average='micro')
f1_micro = f1_score(y_train, tech_preds, average='micro')

print("Training Macro Precision:", precision_macro)
print("Training Macro Recall:", recall_macro)
print("Training Macro F1 Score:", f1_macro)
print("Training Micro Precision:", precision_micro)
print("Training Micro Recall:", recall_micro)
print("Training Micro F1 Score:", f1_micro)



vec = CountVectorizer(ngram_range=(1,1))
X_train_features = vec.fit_transform(X_train) # This should be a matrix
X_val_features = vec.transform(X_val) # This should be a matrix
X_test_features = vec.transform(X_test) # This should be a matrix

model = LinearSVC(dual='auto')
params = {'C': [0.0001, 0.001, 0.01, 0.1, 1, 10, 100]}
clf = GridSearchCV(model, params, scoring='f1_macro', cv=5)
clf.fit(X_train_features, y_train)

y_val_pred = clf.predict(X_val_features)


precision_macro = precision_score(y_test, y_val_pred, average='macro')
recall_macro = recall_score(y_test, y_val_pred, average='macro')
f1_macro = f1_score(y_test, y_val_pred, average='macro')
precision_micro = precision_score(y_test, y_val_pred, average='micro')
recall_micro = recall_score(y_test, y_val_pred, average='micro')
f1_micro = f1_score(y_test, y_val_pred, average='micro')

print("Validation Macro Precision:", precision_macro)
print("Validation Macro Recall:", recall_macro)
print("Validation Macro F1 Score:", f1_macro)
print("Validation Micro Precision:", precision_micro)
print("Validation Micro Recall:", recall_micro)
print("Validation Micro F1 Score:", f1_micro)


y_test_pred = clf.predict(X_test_features)

precision_macro = precision_score(y_test, y_test_pred, average='macro')
recall_macro = recall_score(y_test, y_test_pred, average='macro')
f1_macro = f1_score(y_test, y_test_pred, average='macro')
precision_micro = precision_score(y_test, y_test_pred, average='micro')
recall_micro = recall_score(y_test, y_test_pred, average='micro')
f1_micro = f1_score(y_test, y_test_pred, average='micro')

print("Test Macro Precision:", precision_macro)
print("Test Macro Recall:", recall_macro)
print("Test Macro F1 Score:", f1_macro)
print("Test Micro Precision:", precision_micro)
print("Test Micro Recall:", recall_micro)
print("Test Micro F1 Score:", f1_micro)


Training Macro Precision: 0.5399115741851459
Training Macro Recall: 0.5311089145940748
Training Macro F1 Score: 0.4430216077778054
Training Micro Precision: 0.4514285714285714
Training Micro Recall: 0.4514285714285714
Training Micro F1 Score: 0.4514285714285714




Validation Macro Precision: 0.45495495495495497
Validation Macro Recall: 0.4638888888888889
Validation Macro F1 Score: 0.44972109151213624
Validation Micro Precision: 0.5133333333333333
Validation Micro Recall: 0.5133333333333333
Validation Micro F1 Score: 0.5133333333333333
Test Macro Precision: 0.5657894736842105
Test Macro Recall: 0.55
Test Macro F1 Score: 0.5404411764705882
Test Micro Precision: 0.6
Test Micro Recall: 0.6
Test Micro F1 Score: 0.6




# Subtopic Variable

Next, the Subtopic variable is addressed. The values can be one of four: Education, Mobility, Recreation, or Other.

In [76]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.svm import LinearSVC
from sklearn.metrics import precision_score, recall_score, f1_score, classification_report
from sklearn.model_selection import GridSearchCV

data = pd.read_excel('Gold_Annotations.xlsx')
X = data['Comment']
y = data['Subtopic']


X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5, random_state=42)

class LexiconClassifier():
    def __init__(self):
        self.education_words = ["school", "utsa", "learn", "degree"]
        self.mobility_words = ["road", "drive", "car", "traffic"]
        self.recreation_words = ["park", "play", "club", "drink"]
        
    def subtopic_predict(self, sentence):
        num_education_words = 0
        num_mobility_words = 0     
        num_recreation_words = 0     
        for word in sentence.lower().split():
            if word in self.education_words:
                num_education_words += 1
            if word in self.mobility_words:
                num_mobility_words += 1
            if word in self.recreation_words:
                num_recreation_words += 1
        
        subtopic_pred = 'other'        
        if num_education_words > num_mobility_words and num_education_words > num_recreation_words:
            subtopic_pred = 'education'
        elif num_mobility_words > num_education_words and num_mobility_words > num_recreation_words:
            subtopic_pred = 'mobility'
        elif num_recreation_words > num_education_words and num_recreation_words > num_mobility_words:
            subtopic_pred = 'recreation'

        return subtopic_pred
        

lex_class = LexiconClassifier()
subtopic_preds = []

tmp = ''
for feature in X_train:
    tmp = lex_class.subtopic_predict(feature)
    subtopic_preds.append(tmp)

precision_macro = precision_score(y_train, subtopic_preds, average='macro')
recall_macro = recall_score(y_train, subtopic_preds, average='macro')
f1_macro = f1_score(y_train, subtopic_preds, average='macro')
precision_micro = precision_score(y_train, subtopic_preds, average='micro')
recall_micro = recall_score(y_train, subtopic_preds, average='micro')
f1_micro = f1_score(y_train, subtopic_preds, average='micro')

print("Validation Macro Precision:", precision_macro)
print("Validation Macro Recall:", recall_macro)
print("Validation Macro F1 Score:", f1_macro)
print("Validation Micro Precision:", precision_micro)
print("Validation Micro Recall:", recall_micro)
print("Validation Micro F1 Score:", f1_micro)



vec = CountVectorizer(ngram_range=(1,1))
X_train_features = vec.fit_transform(X_train) # This should be a matrix
X_test_features = vec.transform(X_test) # This should be a matrix

model = LinearSVC(dual='auto')
params = {'C': [0.0001, 0.001, 0.01, 0.1, 1, 10, 100]}
clf = GridSearchCV(model, params, scoring='f1_macro', cv=5)
clf.fit(X_train_features, y_train)

y_test_pred = clf.predict(X_test_features)

precision_macro = precision_score(y_test, y_test_pred, average='macro')
recall_macro = recall_score(y_test, y_test_pred, average='macro')
f1_macro = f1_score(y_test, y_test_pred, average='macro')
precision_micro = precision_score(y_test, y_test_pred, average='micro')
recall_micro = recall_score(y_test, y_test_pred, average='micro')
f1_micro = f1_score(y_test, y_test_pred, average='micro')

print("Test Macro Precision:", precision_macro)
print("Test Macro Recall:", recall_macro)
print("Test Macro F1 Score:", f1_macro)
print("Test Micro Precision:", precision_micro)
print("Test Micro Recall:", recall_micro)
print("Test Micro F1 Score:", f1_micro)

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Validation Macro Precision: 0.0
Validation Macro Recall: 0.0
Validation Macro F1 Score: 0.0
Validation Micro Precision: 0.0
Validation Micro Recall: 0.0
Validation Micro F1 Score: 0.0




Test Macro Precision: 0.37560390097524377
Test Macro Recall: 0.27915571196319106
Test Macro F1 Score: 0.2984903777495987
Test Micro Precision: 0.704
Test Micro Recall: 0.704
Test Micro F1 Score: 0.704


  _warn_prf(average, modifier, msg_start, len(result))
