# MDSS Datathon - Team ADS
#### Johnny Huang, Jonothan Kerslake, Patrick Chen

# Preface

The following notebook parses precleaned twitter data scraped with the Tweepy library. The aim is to:
1. Clean both training and testing data
2. Transform text into suitable data as an input to a machine learning model
3. Output labels from model into csv and submit to kaggle

# Import libraries

In [1]:
import pandas as pd
import re
import nltk
import string
import numpy as np

In [2]:
# Training dataframe
df = pd.read_csv("mdss-basic-stream/train_basic.csv")
# Testing dataframe
df2 = pd.read_csv("mdss-basic-stream/test_basic.csv")

# List of unique labels/tags
tags = list(df['label'].unique())

In [22]:
df['tweet'][3]

'rt errolwebber: tell me, would this be considered "racist?\\xe2\\x80\\x9d\\n\\ndo people who support blm think this is ok? https://t.co/duwabljghp\''

In [4]:
# Referenced from https://stackoverflow.com/questions/753052/strip-html-from-strings-in-python

# Strip HTML tags and other characters, such as &amp
from io import StringIO
from html.parser import HTMLParser

class MLStripper(HTMLParser):
    def __init__(self):
        super().__init__()
        self.reset()
        self.strict = False
        self.convert_charrefs= True
        self.text = StringIO()
    def handle_data(self, d):
        self.text.write(d)
    def get_data(self):
        return self.text.getvalue()

def strip_tags(html):
    s = MLStripper()
    s.feed(html)
    return s.get_data()

# Data Cleaning 

In [5]:
# Drop NaN
df.dropna(inplace=True)

# Drop Duplicates
df.drop_duplicates(subset=['tweet'])

# Strip HTML
df['tweet'] = df['tweet'].apply(lambda x: strip_tags(x))

#Removes Punctuation
df['tweet'] = df['tweet'].apply(lambda x: re.sub('[!@#$:).;,?]', '', str(x)))
#Removes hyperlinks
df['tweet'] = df['tweet'].apply(lambda x: re.sub(r'http\S+', '', str(x)))
#Remove newlines
df['tweet'] = df['tweet'].apply(lambda x: x.strip().replace('\\n', ''))
#Remove encoded characters
df['tweet'] = df['tweet'].apply(lambda x: re.sub(r'(\\x(.){2})', '', str(x)))
#Removes ' , ", and blank space
df['tweet'] = df['tweet'].apply(lambda x: x.replace("'", ""))
df['tweet'] = df['tweet'].apply(lambda x: x.replace('"', ""))
df['tweet'] = df['tweet'].apply(lambda x: x.strip())
#Remove escape charcters but keep emoji codes
df['tweet'] = df['tweet'].apply(lambda x: re.sub("\\\\", '', str(x)))
# Remove 'rt', retweet string. Remove newline character
df['tweet'] = df['tweet'].apply(lambda x: x.strip("rt ").replace('\n', ''))

# Remove other punctuation 
exclude = set(string.punctuation)
df['tweet'] = df['tweet'].apply(lambda x: ''.join(ch for ch in x if ch not in exclude))
df['tweet'] = df['tweet'].apply(lambda x: re.sub('[”’“‘1234567890]', '', str(x)))

In [6]:
df

Unnamed: 0,tweet,label
0,silencing blm priti patels antiprotest law i...,BLM
1,illian johnbok nadiawhittomemp silly little wo...,BLM
2,errolwebber tell me would this be considered r...,BLM
3,apple wont let parler have an app but still ke...,BLM
4,malikaandrews wojespn can we get jlm trending ...,BLM
...,...,...
26395,hey have brainwashed their base for generation...,Riots
26396,blackcat these neanderthals were there for blm...,Riots
26397,atrupar dear jaketapper instead of airing tat...,Riots
26398,his is a bombshellmarjorie greene may have bee...,Riots


### Tokenize testing data
This will separate each string into a list of words

In [7]:
from nltk import word_tokenize

In [45]:
df['token'] = df['tweet'].apply(word_tokenize)
df['token'][0]

['silencing',
 'blm',
 'priti',
 'patels',
 'antiprotest',
 'law',
 'iandunt',
 'on',
 'how',
 'a',
 'government',
 'keen',
 'to',
 'tackle',
 'cancel',
 'culture',
 'when']

### Remove stopwords
This will remove all conjuction words from NLTK's library of stopwords

In [10]:
from nltk.corpus import stopwords
sw = stopwords.words('english')
df['token'] = df['token'].apply(lambda x:[word for word in x if word not in sw])
df['token'][0]

['silencing',
 'blm',
 'priti',
 'patels',
 'antiprotest',
 'law',
 'iandunt',
 'government',
 'keen',
 'tackle',
 'cancel',
 'culture']

### Create Stem Words from Token 
Words will revert to their root/stem word. 
Thus, variances of words such as big / bigger / biggest will collate to the same root word.

In [12]:
from nltk.stem.snowball import SnowballStemmer
stemmer = SnowballStemmer("english", ignore_stopwords=True)
df['stem'] =  df['token'].apply(lambda x: [stemmer.stem(i) for i in x])
df['stem'][0]

['silenc',
 'blm',
 'priti',
 'patel',
 'antiprotest',
 'law',
 'iandunt',
 'govern',
 'keen',
 'tackl',
 'cancel',
 'cultur']

### Word Frequency

Find the word frequency to double check data cleaning process

In [14]:
#Dictionary for token
wordfreq = {}
for token in df['stem']:
    for i in range(len(token)):
        if token[i] not in wordfreq.keys():
            wordfreq[token[i]] = 1
        else:
            wordfreq[token[i]] += 1
            
sorted_stem = sorted(wordfreq.items(), key=lambda kv: kv[1])
sorted_stem.reverse()
sorted_stem

[('covid', 4788),
 ('trump', 4537),
 ('biden', 3032),
 ('joebiden', 1997),
 ('vaccin', 1738),
 ('blm', 1536),
 ('peopl', 1411),
 ('get', 1363),
 ('wearamask', 1327),
 ('capitolriot', 1309),
 ('vote', 1290),
 ('us', 1102),
 ('like', 998),
 ('mask', 956),
 ('know', 956),
 ('one', 916),
 ('dont', 914),
 ('black', 911),
 ('new', 890),
 ('blacklivesmatt', 889),
 ('say', 858),
 ('elect', 842),
 ('pleas', 812),
 ('go', 783),
 ('time', 755),
 ('need', 689),
 ('want', 680),
 ('year', 680),
 ('day', 658),
 ('watch', 616),
 ('make', 592),
 ('capitol', 573),
 ('still', 568),
 ('see', 562),
 ('support', 562),
 ('hunterbiden', 561),
 ('live', 551),
 ('im', 549),
 ('case', 545),
 ('presid', 544),
 ('let', 540),
 ('democrat', 535),
 ('would', 531),
 ('donaldtrump', 527),
 ('twitter', 527),
 ('help', 526),
 ('polic', 517),
 ('state', 516),
 ('american', 516),
 ('fbi', 506),
 ('today', 503),
 ('think', 500),
 ('call', 485),
 ('gop', 484),
 ('via', 476),
 ('never', 463),
 ('bidenharri', 462),
 ('use', 45

In [15]:
#Dictionary for token
wordfreq = {}
for token in df['token']:
    for i in range(len(token)):
        if token[i] not in wordfreq.keys():
            wordfreq[token[i]] = 1
        else:
            wordfreq[token[i]] += 1
            
sorted_token = sorted(wordfreq.items(), key=lambda kv: kv[1])
sorted_token.reverse()
sorted_token

[('covid', 4781),
 ('trump', 4242),
 ('biden', 2810),
 ('joebiden', 1939),
 ('blm', 1536),
 ('people', 1336),
 ('wearamask', 1327),
 ('capitolriots', 1290),
 ('us', 1102),
 ('get', 992),
 ('vaccine', 970),
 ('dont', 914),
 ('vote', 902),
 ('new', 890),
 ('like', 881),
 ('black', 864),
 ('know', 854),
 ('one', 840),
 ('please', 802),
 ('election', 699),
 ('mask', 689),
 ('time', 635),
 ('blacklivesmatter', 631),
 ('capitol', 571),
 ('still', 568),
 ('hunterbiden', 552),
 ('im', 549),
 ('would', 531),
 ('twitter', 520),
 ('donaldtrump', 517),
 ('fbi', 506),
 ('see', 494),
 ('police', 492),
 ('need', 491),
 ('president', 484),
 ('via', 476),
 ('gop', 476),
 ('day', 468),
 ('never', 463),
 ('bidenharris', 462),
 ('want', 439),
 ('today', 438),
 ('watch', 436),
 ('white', 433),
 ('seditionhunters', 418),
 ('realdonaldtrump', 418),
 ('good', 414),
 ('cases', 413),
 ('many', 413),
 ('station', 411),
 ('america', 408),
 ('think', 398),
 ('make', 397),
 ('said', 395),
 ('ump', 394),
 ('even', 3

# Modelling

In [16]:
df['stem_full'] = df['stem'].apply(lambda x: ' '.join(x))
df['token_full'] = df['token'].apply(lambda x: ' '.join(x))
df

Unnamed: 0,tweet,label,token,stem,stem_full,token_full
0,silencing blm priti patels antiprotest law i...,BLM,"[silencing, blm, priti, patels, antiprotest, l...","[silenc, blm, priti, patel, antiprotest, law, ...",silenc blm priti patel antiprotest law iandunt...,silencing blm priti patels antiprotest law ian...
1,illian johnbok nadiawhittomemp silly little wo...,BLM,"[illian, johnbok, nadiawhittomemp, silly, litt...","[illian, johnbok, nadiawhittomemp, silli, litt...",illian johnbok nadiawhittomemp silli littl wok...,illian johnbok nadiawhittomemp silly little wo...
2,errolwebber tell me would this be considered r...,BLM,"[errolwebber, tell, would, considered, racistd...","[errolwebb, tell, would, consid, racistdo, peo...",errolwebb tell would consid racistdo peopl sup...,errolwebber tell would considered racistdo peo...
3,apple wont let parler have an app but still ke...,BLM,"[apple, wont, let, parler, app, still, keeps, ...","[appl, wont, let, parler, app, still, keep, tw...",appl wont let parler app still keep twitter al...,apple wont let parler app still keeps twitter ...
4,malikaandrews wojespn can we get jlm trending ...,BLM,"[malikaandrews, wojespn, get, jlm, trending, m...","[malikaandrew, wojespn, get, jlm, trend, mayb,...",malikaandrew wojespn get jlm trend mayb nba pu...,malikaandrews wojespn get jlm trending maybe n...
...,...,...,...,...,...,...
26395,hey have brainwashed their base for generation...,Riots,"[hey, brainwashed, base, generations, im, stil...","[hey, brainwash, base, generat, im, still, pis...",hey brainwash base generat im still piss trick...,hey brainwashed base generations im still piss...
26396,blackcat these neanderthals were there for blm...,Riots,"[blackcat, neanderthals, blm, blacklivesmatter...","[blackcat, neanderth, blm, blacklivesmatt, cap...",blackcat neanderth blm blacklivesmatt capitolriot,blackcat neanderthals blm blacklivesmatter cap...
26397,atrupar dear jaketapper instead of airing tat...,Riots,"[atrupar, dear, jaketapper, instead, airing, t...","[atrupar, dear, jaketapp, instead, air, tatere...",atrupar dear jaketapp instead air tatereev tru...,atrupar dear jaketapper instead airing tateree...
26398,his is a bombshellmarjorie greene may have bee...,Riots,"[bombshellmarjorie, greene, may, genius, behin...","[bombshellmarjori, green, may, genius, behind,...",bombshellmarjori green may genius behind storm...,bombshellmarjorie greene may genius behind sto...


Create a new column, concatenating all stemmed tweets

In [17]:
df['stem_full'] = df['stem'].apply(lambda x: ' '.join(x))

In [46]:
# y = List of associated labels to each tweet
y = [i for i in df['label']]

In [19]:
corpus = [i for i in df['stem_full']]
corpus_tok = [i for i in df['token_full']]

### Internal performance check

#### Use `train_test_split` with training dataset to check performance of models

In [33]:
# Count Vectorizer
from sklearn.feature_extraction.text import CountVectorizer

# TF-IDF
from sklearn.feature_extraction.text import TfidfTransformer

# Machine Learning; Training Naive Bayes (NB) classifier on training data.
from sklearn.naive_bayes import MultinomialNB

from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.model_selection import train_test_split

In [34]:
# We can write less code and run all the libraries in generating a model, by building a pipeline as follows:
from sklearn.pipeline import Pipeline

In [23]:
X_train, X_test, y_train, y_test = train_test_split(corpus, y, test_size=0.33, random_state=42)

clf_stem = Pipeline([('vect', CountVectorizer()), ('tfidf', TfidfTransformer()), ('clf', MultinomialNB())])
clf_stem = clf_stem.fit(X_train, y_train)

y_pred = clf_stem.predict(X_test)

print('accuracy %s' % accuracy_score(y_pred, y_test))
print(classification_report(y_test, y_pred,target_names=tags))

accuracy 0.7797982314111347
              precision    recall  f1-score   support

         BLM       0.91      0.67      0.77      1631
       Trump       0.79      0.68      0.73      1080
       Biden       0.70      0.99      0.82      2851
       Covid       0.95      0.76      0.84      1376
       Riots       0.74      0.52      0.61      1091

    accuracy                           0.78      8029
   macro avg       0.82      0.72      0.76      8029
weighted avg       0.80      0.78      0.77      8029



In [24]:
X_train_tok, X_test_tok, y_train_tok, y_test_tok = train_test_split(corpus_tok, y, test_size=0.33, random_state=42)

clf_tok = Pipeline([('vect', CountVectorizer()), ('tfidf', TfidfTransformer()), ('clf', MultinomialNB())])
clf_tok = clf_stem.fit(X_train_tok, y_train_tok)

y_pred_tok = clf_stem.predict(X_test_tok)

print('accuracy %s' % accuracy_score(y_pred_tok, y_test_tok))
print(classification_report(y_test_tok, y_pred_tok,target_names=tags))

accuracy 0.7865238510399801
              precision    recall  f1-score   support

         BLM       0.92      0.67      0.78      1631
       Trump       0.79      0.70      0.74      1080
       Biden       0.71      0.99      0.83      2851
       Covid       0.95      0.78      0.86      1376
       Riots       0.74      0.53      0.62      1091

    accuracy                           0.79      8029
   macro avg       0.82      0.73      0.76      8029
weighted avg       0.81      0.79      0.78      8029



# Clean testing data

Remember testing dataframe was set as `df2`

In [25]:
# Remove duplicates, other languages, remove punctuation again

# Drop NaN
df2.dropna(inplace=True)

# Drop Duplicates
df2.drop_duplicates(subset=['tweet'])

# Strip HTML
df2['tweet'] = df2['tweet'].apply(lambda x: strip_tags(x))

#Removes Punctuation
df2['tweet'] = df2['tweet'].apply(lambda x: re.sub('[!@#$:).;,?&]', '', str(x)))
#Removes hyperlinks
df2['tweet'] = df2['tweet'].apply(lambda x: re.sub(r'http\S+', '', str(x)))
#Remove newlines
df2['tweet'] = df2['tweet'].apply(lambda x: x.strip().replace('\\n', ''))
#Remove encoded characters
df2['tweet'] = df2['tweet'].apply(lambda x: re.sub(r'(\\x(.){2})', '', str(x)))
#Removes ' , ", and blank space
df2['tweet'] = df2['tweet'].apply(lambda x: x.replace("'", ""))
df2['tweet'] = df2['tweet'].apply(lambda x: x.replace('"', ""))
df2['tweet'] = df2['tweet'].apply(lambda x: x.strip())
#Remove escape charcters but keep emoji codes
df2['tweet'] = df2['tweet'].apply(lambda x: re.sub("\\\\", '', str(x)))
# Remove 'rt', retweet string. Remove newline character
df2['tweet'] = df2['tweet'].apply(lambda x: x.strip("rt ").replace('\n', ''))

# Remove other punctuation 
df2['tweet'] = df2['tweet'].apply(lambda x: ''.join(ch for ch in x if ch not in exclude))
df2['tweet'] = df2['tweet'].apply(lambda x: re.sub('[”’“1234567890]', '', str(x)))


### Tokenize words

In [26]:
df2['token'] = df2['tweet'].apply(word_tokenize)

### Remove Stop Words

In [27]:
df2['token'] = df2['token'].apply(lambda x:[word for word in x if word not in sw])

### Create Stem Words from Token 

In [28]:
df2['stem'] =  df2['token'].apply(lambda x: [stemmer.stem(i) for i in x])

### Word Frequency

In [29]:
#Dictionary for token
wordfreq2 = {}
for token in df2['stem']:
    for i in range(len(token)):
        if token[i] not in wordfreq2.keys():
            wordfreq2[token[i]] = 1
        else:
            wordfreq2[token[i]] += 1
            
sorted_stem2 = sorted(wordfreq2.items(), key=lambda kv: kv[1])
sorted_stem2.reverse()
sorted_stem2

[('trump', 1411),
 ('biden', 993),
 ('covid', 727),
 ('wearamask', 637),
 ('joebiden', 452),
 ('blacklivesmatt', 426),
 ('mask', 355),
 ('vote', 302),
 ('die', 286),
 ('get', 286),
 ('capitolriot', 281),
 ('peopl', 277),
 ('like', 256),
 ('black', 255),
 ('us', 239),
 ('elect', 232),
 ('one', 231),
 ('station', 225),
 ('dont', 222),
 ('day', 208),
 ('de', 205),
 ('ctzebra', 204),
 ('donaldtrump', 196),
 ('yo', 196),
 ('say', 189),
 ('vaccin', 188),
 ('new', 185),
 ('wear', 175),
 ('know', 174),
 ('usa', 172),
 ('twitter', 167),
 ('time', 166),
 ('go', 161),
 ('pleas', 157),
 ('still', 151),
 ('support', 147),
 ('want', 139),
 ('american', 135),
 ('today', 134),
 ('coronavirus', 133),
 ('ump', 132),
 ('would', 127),
 ('make', 126),
 ('capitol', 126),
 ('see', 125),
 ('presid', 123),
 ('im', 123),
 ('live', 121),
 ('need', 120),
 ('white', 120),
 ('depart', 117),
 ('la', 117),
 ('look', 116),
 ('realdonaldtrump', 115),
 ('news', 114),
 ('state', 114),
 ('democrat', 113),
 ('america', 112

In [30]:
#Dictionary for token
wordfreq2_tok = {}
for token in df2['token']:
    for i in range(len(token)):
        if token[i] not in wordfreq2_tok.keys():
            wordfreq2_tok[token[i]] = 1
        else:
            wordfreq2_tok[token[i]] += 1
            
sorted_token = sorted(wordfreq2_tok.items(), key=lambda kv: kv[1])
sorted_token.reverse()
sorted_token

[('trump', 1348),
 ('biden', 949),
 ('covid', 725),
 ('wearamask', 637),
 ('joebiden', 439),
 ('blacklivesmatter', 283),
 ('capitolriots', 279),
 ('people', 271),
 ('mask', 250),
 ('black', 242),
 ('us', 239),
 ('like', 237),
 ('station', 222),
 ('dont', 222),
 ('one', 213),
 ('died', 211),
 ('de', 205),
 ('vote', 205),
 ('ctzebra', 204),
 ('get', 203),
 ('yo', 196),
 ('donaldtrump', 195),
 ('new', 185),
 ('election', 180),
 ('usa', 172),
 ('twitter', 165),
 ('day', 154),
 ('please', 153),
 ('still', 151),
 ('know', 148),
 ('blacklivesmatte', 143),
 ('coronavirus', 133),
 ('time', 131),
 ('ump', 127),
 ('would', 127),
 ('today', 127),
 ('capitol', 126),
 ('im', 122),
 ('white', 118),
 ('la', 117),
 ('realdonaldtrump', 115),
 ('news', 114),
 ('president', 112),
 ('see', 112),
 ('bidenharris', 110),
 ('maga', 110),
 ('gop', 108),
 ('facebook', 106),
 ('police', 106),
 ('america', 106),
 ('town', 105),
 ('wear', 104),
 ('joe', 104),
 ('never', 104),
 ('good', 103),
 ('departed', 99),
 ('w

Create a new column, concatenating all words that are tokenized/stemmed into a single string

In [31]:
df2['stem_full'] = df2['stem'].apply(lambda x: ' '.join(x))
df2['token_full'] = df2['token'].apply(lambda x: ' '.join(x))
df2

Unnamed: 0,Train_id,tweet,token,stem,stem_full,token_full
0,1,congratulations rrhdr and publichealthumn so h...,"[congratulations, rrhdr, publichealthumn, happ...","[congratul, rrhdr, publichealthumn, happi, see...",congratul rrhdr publichealthumn happi see mn s...,congratulations rrhdr publichealthumn happy se...
1,2,he same people who perpetrated the whitesuprem...,"[people, perpetrated, whitesupremacy, domestic...","[peopl, perpetr, whitesupremaci, domesticterro...",peopl perpetr whitesupremaci domesticterrorist...,people perpetrated whitesupremacy domesticterr...
2,3,fannie lou hamerblackhistoryblackhistorymonthb...,"[fannie, lou, hamerblackhistoryblackhistorymon...","[fanni, lou, hamerblackhistoryblackhistorymont...",fanni lou hamerblackhistoryblackhistorymonthbl...,fannie lou hamerblackhistoryblackhistorymonthb...
3,4,kylandyoung williamcson million properly dist...,"[kylandyoung, williamcson, million, properly, ...","[kylandyoung, williamcson, million, proper, di...",kylandyoung williamcson million proper distrib...,kylandyoung williamcson million properly distr...
4,5,its the colors for the month for meblacklivesm...,"[colors, month, meblacklivesmatterthats, thats...","[color, month, meblacklivesmatterthat, that, p...",color month meblacklivesmatterthat that post b...,colors month meblacklivesmatterthats thats pos...
...,...,...,...,...,...,...
5959,5960,members of the us capitol police force nati...,"[members, us, capitol, police, force, national...","[member, us, capitol, polic, forc, nation, gua...",member us capitol polic forc nation guard memb...,members us capitol police force national guard...
5960,5961,excuses dont excuse the republicans have excu...,"[excuses, dont, excuse, republicans, excuses, ...","[excus, dont, excus, republican, excus, should...",excus dont excus republican excus shouldnt imp...,excuses dont excuse republicans excuses should...
5961,5962,here were hundreds of crimes committed on jan ...,"[hundreds, crimes, committed, jan, capitol, hu...","[hundr, crime, commit, jan, capitol, hundr, do...",hundr crime commit jan capitol hundr dont make...,hundreds crimes committed jan capitol hundreds...
5962,5963,simonwghost im creating ingoditrustrioter to k...,"[simonwghost, im, creating, ingoditrustrioter,...","[simonwghost, im, creat, ingoditrustriot, keep...",simonwghost im creat ingoditrustriot keep tab ...,simonwghost im creating ingoditrustrioter keep...


# Training a model
## Start with Naive Bayes  (Scored 0.734)

In [35]:
# corpus = X_train
# y = y_train
# corpus2 = x_test
# y_pred = predicted y labels


clf = Pipeline([('vect', CountVectorizer()), ('tfidf', TfidfTransformer()), ('clf', MultinomialNB())])
clf = clf.fit(corpus, y)

corpus2 = df2['stem_full']
y_pred = clf.predict(corpus2)

In [36]:
df_nb = pd.read_csv("mdss-basic-stream/test_basic.csv")
df_nb['label'] = y_pred

#### To CSV

In [37]:
df_nb.drop(['tweet'],axis=1,inplace=True)

In [38]:
df_nb.to_csv("submission_nb_new.csv", index=False)

# SVM with stemmed words (Scored 0.83)

In [39]:
from sklearn.linear_model import SGDClassifier

# corpus = X_train
# y = y_train
# corpus2 = x_test
# y_pred = predicted y labels

sgd = Pipeline([('vect', CountVectorizer()),
                ('tfidf', TfidfTransformer()),
                ('clf', SGDClassifier(loss='hinge', penalty='l2',alpha=1e-3, random_state=42, max_iter=5, tol=None)),
               ])
sgd.fit(corpus, y)

corpus2 = df2['stem_full']
y_pred_svm = clf.predict(corpus2)

df_svm = pd.read_csv("mdss-basic-stream/test_basic.csv")
df_svm['label'] = y_pred_svm

#### To CSV

In [40]:
df_svm.drop(['tweet'],axis=1,inplace=True)

In [41]:
df_svm.to_csv("submission_svm_new.csv", index=False)

# SVM with tokenized (not stemmed) words (Scored 0.83)

In [42]:
# SVM
sgd_tok = Pipeline([('vect', CountVectorizer()),
                ('tfidf', TfidfTransformer()),
                ('clf', SGDClassifier(loss='hinge', penalty='l2',alpha=1e-3, random_state=42, max_iter=5, tol=None)),
               ])
sgd_tok.fit(corpus_tok, y)

corpus_tok_svm = df2['stem_full']
y_pred_tok_svm = sgd_tok.predict(corpus_tok_svm)

df_svm_tok = pd.read_csv("mdss-basic-stream/test_basic.csv")
df_svm_tok['label'] = y_pred_tok_svm

#### To CSV

In [43]:
df_svm_tok.drop(['tweet'],axis=1,inplace=True)

In [44]:
df_svm_tok.to_csv("submission_svm_tokenised_new.csv", index=False)