In [1]:
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC, SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import classification_report
import pandas as pd


In [2]:
enc = LabelEncoder()

# Load the tweet dataset and append "Sport_Cat", the label-encoded form of
# the "Sport" column, in a single chained expression.
sports_tweets = pd.read_csv("CleanedSportsTweets.csv").assign(
    Sport_Cat=lambda frame: enc.fit_transform(frame["Sport"])
)

In [3]:
# The "Tweet" column is the model input; the encoded "Sport_Cat" column is the target.
x, y = sports_tweets["Tweet"], sports_tweets["Sport_Cat"]

# TF-IDF vectorizer over unigrams and bigrams (ngram_range=(1, 2)),
# capped at 1000 features.
tfidf = TfidfVectorizer(ngram_range=(1, 2), max_features=1000)

In [4]:
# Turn every tweet into its TF-IDF feature vector.
# NOTE(review): the vectorizer is fitted on all tweets before any fold or
# hold-out split is made, so its vocabulary/IDF statistics include the data
# later used for evaluation -- consider wrapping vectorizer + estimator in a
# Pipeline so the fit happens inside each training fold.
x_fit = tfidf.fit_transform(x)

# Stratified 10-fold splitter shared by every cross-validation run below.
# The train/test hold-out split is made later, once a classifier is chosen.
skfold = StratifiedKFold(n_splits=10)

In [5]:
# Single seed for the estimators that draw random numbers with their default
# settings, so that model selection and the reported metrics are reproducible
# across kernel restarts. The value matches the seed of the train/test split.
RANDOM_STATE = 2

# Candidate classifiers compared by cross-validation.
# SVC, KNeighborsClassifier and LogisticRegression are left unseeded: with the
# default settings used here they do not consume a random state.
lin_svc = LinearSVC(random_state=RANDOM_STATE)
rbf_svc = SVC()
knn = KNeighborsClassifier()
log_reg = LogisticRegression()
rfc = RandomForestClassifier(n_estimators=50, random_state=RANDOM_STATE)
mlp = MLPClassifier(random_state=RANDOM_STATE)

In [6]:
def average(arr) -> float:
    """Return the arithmetic mean of ``arr``.

    ``arr`` must support both ``sum()`` and ``len()``. An empty ``arr``
    raises ``ZeroDivisionError``.
    """
    total = sum(arr)
    count = len(arr)
    return total / count

In [7]:
def _cv_scores(model):
    """Return the per-fold scores of ``model`` on ``x_fit``/``y`` using the shared ``skfold`` splitter."""
    return cross_val_score(model, x_fit, y, cv=skfold)


# Cross-validate every candidate classifier on the same folds.
lin_svc_scores = _cv_scores(lin_svc)
rbf_svc_scores = _cv_scores(rbf_svc)
knn_scores = _cv_scores(knn)
log_reg_scores = _cv_scores(log_reg)
rfc_scores = _cv_scores(rfc)
mlp_scores = _cv_scores(mlp)




In [14]:
def choose_classifier(lin_svc, rbf_svc, knn, log_reg, rfc, mlp):
    """Return the classifier whose cross-validation scores have the highest mean.

    Each argument is paired with the notebook-level score array of the same
    kind (``lin_svc_scores``, ``rbf_svc_scores``, ``knn_scores``,
    ``log_reg_scores``, ``rfc_scores``, ``mlp_scores``), which must already
    have been computed. The object returned is the one passed in by the
    caller, not a global looked up by name.

    Ties are resolved in favour of the earliest candidate in parameter order.
    """
    candidates = [
        (lin_svc, lin_svc_scores),
        (rbf_svc, rbf_svc_scores),
        (knn, knn_scores),
        (log_reg, log_reg_scores),
        (rfc, rfc_scores),
        (mlp, mlp_scores),
    ]
    # max() keeps the first maximal element, so ordering above decides ties.
    best_classifier, _ = max(candidates, key=lambda pair: average(pair[1]))
    return best_classifier

def string_to_classifier(string):
    """Map a short label to the matching notebook-level classifier.

    Recognised labels are "lin_svc", "rbf_svc", "knn", "log", "rfc" and
    "mlp". Any other value yields ``None``.
    """
    # Each getter defers the global lookup until its label actually matches.
    registry = (
        ("lin_svc", lambda: lin_svc),
        ("rbf_svc", lambda: rbf_svc),
        ("knn", lambda: knn),
        ("log", lambda: log_reg),
        ("rfc", lambda: rfc),
        ("mlp", lambda: mlp),
    )
    for label, fetch in registry:
        if string == label:
            return fetch()
    return None

# Select the candidate with the best mean cross-validation score.
classifier = choose_classifier(
    lin_svc=lin_svc,
    rbf_svc=rbf_svc,
    knn=knn,
    log_reg=log_reg,
    rfc=rfc,
    mlp=mlp,
)

In [24]:
# With the classifier chosen, hold out 20% of the vectorized tweets for a
# final evaluation. random_state is fixed so the split is the same on every run.
x_train, x_test, y_train, y_test = train_test_split(
    x_fit,
    y,
    random_state=2,
    test_size=0.2,
)

In [25]:
# Train the selected classifier on the training portion. The result is not
# bound to `x`: that name holds the tweet Series consumed by the vectorizer
# cell, and overwriting it would break re-running that cell.
classifier.fit(x_train, y_train)

# Predictions on the held-out portion, used by the classification report.
y_pred = classifier.predict(x_test)

# Score on the held-out portion; kept as the cell's last expression so the
# notebook actually displays it.
classifier.score(x_test, y_test)


In [26]:
# Per-class precision, recall and F1 for the held-out predictions.
print(classification_report(y_true=y_test, y_pred=y_pred))

              precision    recall  f1-score   support

           0       0.87      0.74      0.80        99
           1       1.00      0.57      0.72        30
           2       0.85      0.98      0.91       256
           3       1.00      0.60      0.75         5
           4       0.88      0.70      0.78        60

    accuracy                           0.86       450
   macro avg       0.92      0.72      0.79       450
weighted avg       0.87      0.86      0.85       450

