# Classify tweets by predicting country of origin using feature extraction from text

In [114]:
import pandas as pd
import re
import os
import cytoolz as ct
from gensim.parsing import preprocessing
import numpy as np
import random

# Load and explore dataset

In [23]:
# NOTE(review): the path has no leading separator, so it is resolved relative
# to the current working directory - confirm this is intended.
file = os.path.join(
    "srv", "data", "shared_data_folder", "data", "Twitter_by_Country.gz"
)
# The archive is passed straight to pandas, which reads it as CSV.
df = pd.read_csv(file)

In [24]:
# Display the freshly loaded frame to check its columns and size.
df

Unnamed: 0.1,Unnamed: 0,Country,Text
0,0,Australia,this long thread on medieval dyes and pigments...
1,1,Australia,no i didnt just bust my ass to she in the dark...
2,2,Australia,give him a beak he s just speaking in tongues ...
3,3,Australia,great movie great ending to years of my lifeun...
4,4,Australia,happy holidays we wish you all the best for th...
...,...,...,...
1140243,1140243,United_States,you also decided that i didn t read the articl...
1140244,1140244,United_States,great info do you know what to do participate ...
1140245,1140245,United_States,my yo gave me a bad yelp review so i put him u...
1140246,1140246,United_States,this is real smart city stuff not fake news sm...


In [26]:
# Keep only the label and the text; every other column is dropped.
df = df.loc[:, ["Country", "Text"]]

In [27]:
# Display the frame again to confirm that only Country and Text remain.
df

Unnamed: 0,Country,Text
0,Australia,this long thread on medieval dyes and pigments...
1,Australia,no i didnt just bust my ass to she in the dark...
2,Australia,give him a beak he s just speaking in tongues ...
3,Australia,great movie great ending to years of my lifeun...
4,Australia,happy holidays we wish you all the best for th...
...,...,...
1140243,United_States,you also decided that i didn t read the articl...
1140244,United_States,great info do you know what to do participate ...
1140245,United_States,my yo gave me a bad yelp review so i put him u...
1140246,United_States,this is real smart city stuff not fake news sm...


In [63]:
# Overall size first, then one aligned line per country.
print('Total number of tweets: ', len(df))
print('number of tweets per country:')
tweets_per_country = df.groupby('Country').size()
for country, n_tweets in tweets_per_country.items():
    print(f'{country:25} {n_tweets}')

Total number of tweets:  1140248
number of tweets per country:
Australia                 192307
Ireland                   192307
New_Zealand               178713
South_Africa              192307
United_Kingdom            192307
United_States             192307


# Obtain train and test set

In [83]:
from sklearn.model_selection import train_test_split

# Hold out 10% of the tweets for evaluation. The fixed random_state makes the
# shuffle - and therefore every number computed downstream - reproducible
# after a kernel restart.
xtrain, xtest, ytrain, ytest = train_test_split(
    df.Text, df.Country, test_size=0.1, random_state=42
)

In [92]:
# Check which container type train_test_split returned for the training texts.
type(xtrain)

pandas.core.series.Series

In [110]:
# Preview the training texts produced by the split.
xtrain

29421      going to the movies by myself is my preference...
751212     when it comes to perfomance you will not exper...
839886     thank you for sharing our post have a great da...
167733     insert whatever this is epic pepe fuentes styl...
380807     plaza de las flores a square lined with floris...
                                 ...                        
906105     i seem to have missed a lot of madness todayto...
1055068    if the guy can throw anything like sales slide...
216052     ahhhhb i would say thankyou again also thx for...
696442     more spiritual cause i would sleep after jumma...
953238     famous dex speaks on juice wrld s passing atoh...
Name: Text, Length: 1026223, dtype: object

# Preprocess dataset for feature extraction

In [84]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [98]:
# Optional phrase model. When it is not None, clean() looks the token list up
# with phrases[tokens] (e.g. phrases previously found with PMI).
phrases = None


def clean(line, tag=False):
    """Normalise one tweet into a space-separated string of cleaned tokens.

    Args:
        line: Raw tweet text.
        tag: When truthy, each token is replaced by ``token_POS`` using the
            part-of-speech tag reported by ``nlp``.

    Returns:
        The cleaned tokens joined by single spaces.
    """
    # Remove links, at-mentions, hashtags and mark-up.
    line = re.sub(r"http\S+", "", line)
    line = re.sub(r"@\S+", "", line)
    line = re.sub(r"#\S+", "", line)
    line = re.sub(r"<[^>]*>", "", line)

    # Drop the retweet marker only when it is a standalone token. A plain
    # substring replace also eats "RT" at word edges ("SMART phone" would
    # become "SMAphone"). Substituting a space keeps the neighbouring words
    # apart; the extra whitespace is collapsed below.
    line = re.sub(r"\bRT\b", " ", line)

    # Remove punctuation, digits, other non-alphanumerics and extra spaces.
    line = ct.pipe(
        line,
        preprocessing.strip_tags,
        preprocessing.strip_punctuation,
        preprocessing.strip_numeric,
        preprocessing.strip_non_alphanum,
        preprocessing.strip_multiple_whitespaces,
    )

    # Lowercase and tokenise; split() already ignores leading/trailing blanks.
    tokens = line.lower().split()

    # If a phrase model has been set, merge tokens into its phrases.
    if phrases is not None:
        tokens = list(phrases[tokens])

    # Optional part-of-speech tagging.
    # NOTE(review): `nlp` is not defined in the visible part of this notebook;
    # it must already exist in the kernel (presumably a spaCy pipeline, since
    # the tokens expose .text and .pos_) - TODO confirm.
    if tag:
        tokens = [w.text + "_" + w.pos_ for w in nlp(" ".join(tokens))]

    return " ".join(tokens)

In [99]:
# Raw text of the first row, used as a reference example.
df.iloc[0].Text

'this long thread on medieval dyes and pigments is one of the things i love about twitter an expert sharing her knoa couple of years ago we did a quiz so if you have some spare time why not see how much you know about theharry styles fine line cd giveaway rt and reply with your top songs from fine line a picture of harry mbf illa glimmer of hope for the last wild population the cape arid np bushfire while still uncontrolled aa tiny step forward in the battle to save the last wild population cape arid fire doesn t seem to haafter years maybe finally naka read ko usab sa church and it feels so good thank you lord happy birthdaymary godwin writes in her journal in the evening hogg comes i like him better each time it is a pity thaenjoying my extra cold castle lite dankolow key true rocky stopped making decent music after yams diedcrabb robinson a call on blake i read him wordsworth s incomparable ode which he heartily enjoyedsevern writes the continued stretch of keats imagination has kil

In [100]:
# Run clean() on the first row as a sanity check of the preprocessing.
clean(df.iloc[0].Text)

'this long thread on medieval dyes and pigments is one of the things i love about twitter an expert sharing her knoa couple of years ago we did a quiz so if you have some spare time why not see how much you know about theharry styles fine line cd giveaway rt and reply with your top songs from fine line a picture of harry mbf illa glimmer of hope for the last wild population the cape arid np bushfire while still uncontrolled aa tiny step forward in the battle to save the last wild population cape arid fire doesn t seem to haafter years maybe finally naka read ko usab sa church and it feels so good thank you lord happy birthdaymary godwin writes in her journal in the evening hogg comes i like him better each time it is a pity thaenjoying my extra cold castle lite dankolow key true rocky stopped making decent music after yams diedcrabb robinson a call on blake i read him wordsworth s incomparable ode which he heartily enjoyedsevern writes the continued stretch of keats imagination has kil

In [101]:
# TF-IDF over single-word terms with L2-normalised rows. Text normalisation is
# delegated to clean(), passed in as the preprocessor.
vectorizer = TfidfVectorizer(
    input="content",
    encoding="utf-8",
    decode_error="replace",
    ngram_range=(1, 1),
    norm="l2",
    use_idf=True,
    smooth_idf=True,
    preprocessor=clean,
    tokenizer=None,
)

In [103]:
# Fit the vectorizer on the training texts only, so the test split does not
# influence the learned vocabulary.
vectorizer.fit(xtrain)

TfidfVectorizer(decode_error='replace',
                preprocessor=<function clean at 0x000001B1F82A5828>)

In [111]:
# Report how many entries the fitted vectorizer holds in its vocabulary.
print('Vocabulary size: ', len(vectorizer.vocabulary_) )

Vocabulary size:  10635190


In [105]:
# Vectorise both splits with the already-fitted vectorizer (no re-fitting).
x_train, x_test = (vectorizer.transform(texts) for texts in (xtrain, xtest))

In [113]:
# Shape of the vectorised training split.
print('shape of matrix vectors on train set:', x_train.shape)

shape of matrix vectors on train set: (1026223, 10635190)


# Train model

In [106]:
from sklearn.svm import LinearSVC

# Linear SVM trained one-vs-rest. With dual=True the solver shuffles the data
# using a random number generator, so random_state is pinned to make the
# fitted model reproducible from run to run.
model = LinearSVC(
    penalty="l2",
    loss="squared_hinge",
    dual=True,
    tol=0.0001,
    C=1.0,
    multi_class="ovr",
    fit_intercept=True,
    intercept_scaling=1,
    max_iter=200000,
    random_state=42,
)

In [107]:
# Train the classifier on the vectorised training texts and their country labels.
model.fit(x_train, ytrain)

LinearSVC(max_iter=200000)

# Evaluate model

In [108]:
from sklearn.metrics import classification_report

In [109]:
# Predict a country for every held-out tweet, then print the per-class
# precision, recall and F1 against the true labels.
predictions = model.predict(x_test)
print(classification_report(y_true=ytest, y_pred=predictions))

                precision    recall  f1-score   support

     Australia       0.97      0.95      0.96     19215
       Ireland       0.98      0.97      0.97     19634
   New_Zealand       0.97      0.94      0.96     17696
  South_Africa       0.99      0.98      0.98     19032
United_Kingdom       0.94      0.97      0.95     19174
 United_States       0.95      0.99      0.97     19274

      accuracy                           0.96    114025
     macro avg       0.97      0.96      0.96    114025
  weighted avg       0.97      0.96      0.96    114025



# Sample prediction

In [115]:
# Draw one row label from the test split at random, then compare the true
# country of that tweet with the model's prediction for it.
(index,) = random.sample(list(xtest.index), k=1)
tweet = xtest[index]
predicted_label = model.predict(vectorizer.transform([tweet]))[0]

print(f'====================== Prediction for tweet n°{index} ===============================')
print(tweet)
print()
print(f'True label --------> {ytest[index]}')
print(f'Predicted label ---> {predicted_label}')

what reaction from his teammates i wondercracks me upfunny thought provoking strange intelligent and hilarious thanks amp brilliant showi hear kevin and stuart went professional afteri can t find the exact one i bought but these are the same i think you ll like d poover people banded together to recreate one of ireland s most famous mythological feats viathanks yvette i think everyone got stuck in to givewhen i hear euphoria i m fine epiphany i cry my eyes outin fairness billy don t we don t we deserve a govt that is honest productive free of corruption and representathe weirdest thing about was that facebook was still working but other apps internet wouldn t load at allits been days since the conceand i still cry when i hear a song see a concephoto i m legithappy birthday jiminnie you looked like an actual king at the concert getwhat it must be like playing upfront for irelandand in some countries men are not allowed to work at alleveryone hates med and pre med students for being cutt