In [0]:
# Import libs

import pandas as pd
import re
import numpy as np
from nltk.tokenize import WordPunctTokenizer, TweetTokenizer
from sklearn.feature_extraction.text import TfidfVectorizer

In [0]:
# Load Training, Dev and Test Data

def loadData(filepath):
  data = pd.read_csv(filepath)
  # Currently working on region level predictor
  data.drop(['lat','long','state'],axis=1,inplace=True)
  # Filter and remove anomaly data
  data = data[data.region != 'MX']  # Remove 'Mexico' records - Due to Reverse geocoding
  data = data[data.region != 'CA']  # Remove 'Canada' records - Due to Reverse geocoding
  data.dropna(inplace=True)
  data.reset_index(drop=True,inplace=True)
  return data

# train file and dev file has columns tweet, lat, long, state, region
# test file has columns uid (userid), tweet, lat, long, state, region
train = loadData("Path to train .csv file")
dev = loadData("Path to dev .csv file")
test = loadData("Path to test .csv file")
tweet_test = test.drop('uid', axis=1)  # Tweet level test
user_test = test.drop('tweet', axis=1).drop_duplicates()  # User level test

# print(train.head(5))
# print(train.iloc[0])
# print(train.region.unique())
# train.info()
print("Train shape: "+ str(train.shape))
print("Dev shape: "+ str(dev.shape))
print("Test shape: "+ str(test.shape))
print("Tweet Test shape: "+ str(tweet_test.shape))
print("User Test shape: "+ str(user_test.shape))

Train shape: (218147, 2)
Dev shape: (74079, 2)
Test shape: (74382, 3)
Tweet Test shape: (74382, 2)
User Test shape: (1858, 2)


In [0]:
# Data Preparation

# Tweet cleaning
tt = TweetTokenizer(strip_handles=True, reduce_len=True)
pat1 = r'@[A-Za-z0-9_]+'  # To remove '@' mentions
pat2 = r'https?://[^ ]+'  # To remove web links starting with http
combined_pat = r'|'.join((pat1, pat2))
www_pat = r'www.[^ ]+'  # To remove web links starting without http
'''negations_dic = {"isn't":"is not", "aren't":"are not", "wasn't":"was not", "weren't":"were not",
                "haven't":"have not","hasn't":"has not","hadn't":"had not","won't":"will not",
                "wouldn't":"would not", "don't":"do not", "doesn't":"does not","didn't":"did not",
                "can't":"can not","couldn't":"could not","shouldn't":"should not","mightn't":"might not",
                "mustn't":"must not"}
neg_pattern = re.compile(r'\b(' + '|'.join(negations_dic.keys()) + r')\b')'''

def tweet_cleaner(text):
    stripped = re.sub(combined_pat, '', text)
    stripped = re.sub(www_pat, '', stripped)
    lower_case = stripped.lower()
    # neg_handled = neg_pattern.sub(lambda x: negations_dic[x.group()], lower_case)
    # letters_only = re.sub("[^a-zA-Z]", " ", neg_handled)
    # During the letters_only process two lines above, it has created unnecessay white spaces,
    # Will tokenize and join together to remove unneccessary white spaces
    # words = [x for x  in wpt.tokenize(letters_only) if len(x) > 1]
    words = [x for x  in tt.tokenize(lower_case) if len(x) > 1]
    return (" ".join(words)).strip()
  
# print(tweet_cleaner(" www.msundarv.com don't First batting also dhik dhik, chasing also dhik dhik! Finger nails, toe nails, no bias! 😷 #Yellove #WhistlePodu #CSKvRR 🦁💛"))

train.tweet = train.tweet.apply(tweet_cleaner)
# print(train.head(5))
# print(train.iloc[0])
dev.tweet = dev.tweet.apply(tweet_cleaner)
tweet_test.tweet = tweet_test.tweet.apply(tweet_cleaner)

print("Train shape: "+ str(train.shape))
print("Dev shape: "+ str(dev.shape))
print("Test shape: "+ str(tweet_test.shape))

Train shape: (218147, 2)
Dev shape: (74079, 2)
Test shape: (74382, 2)


In [0]:
# Vectorization

tweet_vectorizer = TfidfVectorizer(max_features = 5000,ngram_range = (1,3),stop_words='english')

X_train = tweet_vectorizer.fit_transform(train.tweet)
X_dev = tweet_vectorizer.transform(dev.tweet)
X_test = tweet_vectorizer.transform(tweet_test.tweet)

print("X_Train Shape: "+str(X_train.shape))
print("X_Dev Shape: "+str(X_dev.shape))
print("X_Test Shape: "+str(X_test.shape))

X_Train Shape: (218147, 5000)
X_Dev Shape: (74079, 5000)
X_Test Shape: (74382, 5000)


In [0]:
# Baseline - NB Classifier

from sklearn.naive_bayes import MultinomialNB

nb_classifier = MultinomialNB()
nb_classifier.fit(X_train, train.region)

nb_train_score = nb_classifier.score(X_train, train.region)
nb_dev_score = nb_classifier.score(X_dev, dev.region)
nb_test_score = nb_classifier.score(X_test, test.region)

# Tweet level Accuracy
print("NB Train Accuracy: {:.2f}".format(nb_train_score*100))
print("NB Dev Accuracy: {:.2f}".format(nb_dev_score*100))
print("NB Test Accuracy: {:.2f}".format(nb_test_score*100))

NB Train Accuracy: 49.00
NB Dev Accuracy: 46.45
NB Test Accuracy: 45.09


In [0]:
# Baseline - LR Classifier

from sklearn.linear_model import LogisticRegression

lr_classifier = LogisticRegression()
lr_classifier.fit(X_train, train.region)

lr_train_score = lr_classifier.score(X_train, train.region)
lr_dev_score = lr_classifier.score(X_dev, dev.region)
lr_test_score = lr_classifier.score(X_test, test.region)

# Tweet level Accuracy
print("LR Train Accuracy: {:.2f}".format(lr_train_score*100))
print("LR Dev Accuracy: {:.2f}".format(lr_dev_score*100))
print("LR Test Accuracy: {:.2f}".format(lr_test_score*100))



LR Train Accuracy: 49.71
LR Dev Accuracy: 46.14
LR Test Accuracy: 44.82


In [0]:
# Baseline - SVM Classifier

from sklearn import svm
from sklearn.linear_model import SGDClassifier

# svm_classifier = svm.SVC(kernel='linear', verbose = 1)
svm_classifier = SGDClassifier(max_iter=1000, tol=1e-3, shuffle = True)
svm_classifier.fit(X_train, train.region)

svm_train_score = svm_classifier.score(X_train, train.region)
svm_dev_score = svm_classifier.score(X_dev, dev.region)
svm_test_score = svm_classifier.score(X_test, test.region)

# Tweet level Accuracy
print("SVM Train Accuracy: {:.2f}".format(svm_train_score*100))
print("SVM Dev Accuracy: {:.2f}".format(svm_dev_score*100))
print("SVM Test Accuracy: {:.2f}".format(svm_test_score*100))

SVM Train Accuracy: 47.78
SVM Dev Accuracy: 44.95
SVM Test Accuracy: 43.50


In [0]:
# Model Evaluation - User level Accuracy

from sklearn.metrics import accuracy_score

# Predict region for each tweet using a trained model
y_pred = svm_classifier.predict(X_test)
# print(y_pred.shape)

# Predict a region for each user
test['ypred'] = y_pred
user_pred = test.drop(['tweet','region'],axis=1)
pred = user_pred.groupby(['uid'])['ypred'].agg(lambda x:x.value_counts().index[0])
# print(pred.head(10))

# Compare predicted values with ground truth 
true = user_test.sort_values(by=['uid'])
# print(true.head(10))

final_acc = accuracy_score(true['region'], pred)
print("Final Accuracy: {:.2f}".format(final_acc*100))

Final Accuracy: 51.88




---

