In [0]:
# Mount Google Drive at /content/drive; this prompts for an OAuth code interactively.
from google.colab import drive
drive.mount('/content/drive')

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly

Enter your authorization code:
··········
Mounted at /content/drive


#Requirements

In [0]:
# Installs catboost at runtime. NOTE(review): the version is unpinned, and `%pip`
# (rather than `!pip`) would guarantee the install targets the kernel's environment.
!pip install catboost --quiet

[K     |████████████████████████████████| 64.8MB 54kB/s 
[?25h

#Imports

In [0]:
import warnings
# Silences every warning for the rest of the session. NOTE(review): this also hides
# pandas chained-assignment warnings raised by later cells.
warnings.simplefilter('ignore')

In [0]:
import os
import sys
import random
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from tqdm.notebook import tqdm

In [0]:
from sklearn.model_selection import KFold, StratifiedKFold
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import TruncatedSVD
from sklearn.naive_bayes import MultinomialNB

In [0]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

In [0]:
import nltk
# Fetch the NLTK stop-word corpus so that nltk.corpus.stopwords can be loaded.
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [0]:
from nltk.corpus import stopwords
from collections import Counter

In [0]:
import catboost as cat, lightgbm as lgb, xgboost as xgb

#Envs

In [0]:
# One seed shared by Python's RNG, NumPy's global RNG, the CV splitter and the model.
seed = 2020 # for reproducibility
random.seed(seed)
np.random.seed(seed)

#Utilities

In [0]:
def process_prediction(preds):
  """Turn rows of three class probabilities into signed scalar scores.

  For every row the most probable class decides the output:
    * class 0 wins -> minus the class-0 probability
    * class 1 wins -> 0
    * otherwise    -> the class-2 probability

  Args:
    preds: iterable of array-like rows exposing ``argmax()`` and indexing.

  Returns:
    A Python list with one score per input row.
  """
  def _score(row):
    winner = row.argmax()
    if winner == 0:
      return -1*row[0]
    if winner == 1:
      return 0
    return row[2]

  return [_score(row) for row in preds]


def rmse(true, pred):
  """Root mean squared error between the targets `true` and the estimates `pred`."""
  mse = mean_squared_error(true, pred)
  return np.sqrt(mse)

#Reading the Data

In [0]:
# Folder (relative to the working directory) that holds Train.csv, Test.csv and SampleSubmission.csv.
path = 'drive/My Drive/Zindi/#ZindiWeekendz/'

In [0]:
# Load the labelled tweets, the unlabelled tweets and the submission template.
train, test, sample = (
  pd.read_csv(path + name)
  for name in ('Train.csv', 'Test.csv', 'SampleSubmission.csv')
)

#EDA

In [0]:
# Preview the first ten training rows.
train.head(10)

Unnamed: 0,tweet_id,safe_text,label,agreement
0,CL1KWCMY,Me &amp; The Big Homie meanboy3000 #MEANBOY #M...,0.0,1.0
1,E3303EME,I'm 100% thinking of devoting my career to pro...,1.0,1.0
2,M4IVFSMS,"#whatcausesautism VACCINES, DO NOT VACCINATE Y...",-1.0,1.0
3,1DR6ROZ4,I mean if they immunize my kid with something ...,-1.0,1.0
4,J77ENIIE,Thanks to <user> Catch me performing at La Nui...,0.0,1.0
5,OVNPOAUX,<user> a nearly 67 year old study when mental ...,1.0,0.666667
6,JDA2QDV5,"Study of more than 95,000 kids finds no link b...",1.0,0.666667
7,S6UKR4OJ,psa: VACCINATE YOUR FUCKING KIDS,1.0,1.0
8,V6IJATBE,Coughing extra on the shuttle and everyone thi...,1.0,0.666667
9,VB25IDQK,AIDS vaccine created at Oregon Health &amp; Sc...,1.0,0.666667


In [0]:
# Preview the first five test rows (no label/agreement columns here).
test.head()

Unnamed: 0,tweet_id,safe_text
0,00BHHHP1,<user> <user> ... &amp; 4 a vaccine given 2 he...
1,00UNMD0E,Students starting school without whooping coug...
2,01AXPTJF,"I'm kinda over every ep of <user> being ""rippe..."
3,01HOEQJW,How many innocent children die for lack of vac...
4,01JUKMAO,"CDC eyeing bird flu vaccine for humans, though..."


In [0]:
# Class balance check; besides -1/0/1 the output shows a single stray 0.666667 label.
train['label'].value_counts()

 0.000000    4908
 1.000000    4053
-1.000000    1038
 0.666667       1
Name: label, dtype: int64

In [0]:
# Locate the row whose label rounds to 0.67; in the output its tweet text sits in the
# tweet_id column, i.e. the record looks column-shifted.
train[train['label'].round(2)==0.67]

Unnamed: 0,tweet_id,safe_text,label,agreement
4799,I cannot believe in this day and age some pare...,1,0.666667,


In [0]:
# Rows with a missing label.
train[train['label'].isna()]

Unnamed: 0,tweet_id,safe_text,label,agreement
4798,RQMQ0L2A,#lawandorderSVU,,


In [0]:
# Rows with a missing agreement score; the output lists the two malformed records (4798, 4799).
train[train['agreement'].isna()]

Unnamed: 0,tweet_id,safe_text,label,agreement
4798,RQMQ0L2A,#lawandorderSVU,,
4799,I cannot believe in this day and age some pare...,1,0.666667,


#Feature Engineering

##Part 1

In [0]:
# Remove the malformed records found during EDA (missing label / stray 0.666667 label,
# both with a missing agreement score). Filtering on validity instead of dropping
# hard-coded positions 4798/4799 in place keeps this cell idempotent: re-running it
# no longer deletes two further, valid rows from the already re-indexed frame.
train = train[
  train['label'].isin([-1, 0, 1]) & train['agreement'].notna()
].reset_index(drop=True)

In [0]:
# Coerce every tweet to str so the string operations below never meet a non-string value.
train['safe_text'] = train['safe_text'].map(str)
test['safe_text'] = test['safe_text'].map(str)

In [0]:
# Lower-case every tweet.
train['safe_text'] = train['safe_text'].str.lower()
test['safe_text'] = test['safe_text'].str.lower()

In [0]:
# Delete the HTML-escaped ampersand; literal match, not a regular expression.
train['safe_text'] = train['safe_text'].str.replace('&amp;', '', regex=False)
test['safe_text'] = test['safe_text'].str.replace('&amp;', '', regex=False)

In [0]:
# Trim surrounding whitespace first, then any leading/trailing full stops.
train['safe_text'] = train['safe_text'].str.strip().str.strip('.')
test['safe_text'] = test['safe_text'].str.strip().str.strip('.')

In [0]:
# Spot-check the normalised text.
train.head(2)

Unnamed: 0,tweet_id,safe_text,label,agreement
0,CL1KWCMY,me the big homie meanboy3000 #meanboy #mb #mb...,0.0,1.0
1,E3303EME,i'm 100% thinking of devoting my career to pro...,1.0,1.0


##Part 2

In [0]:
def clean_stop_words(df):
  """Add a ``text_clean`` column to ``df`` holding a de-noised copy of ``safe_text``.

  Each tweet is lower-cased and split on whitespace; tokens that start with
  ``<`` (placeholders such as ``<user>``) or ``#`` (hashtags) are dropped and the
  remaining tokens are re-joined with single spaces.

  Despite its name, this function does not remove English stop words. The NLTK
  stop-word set that used to be built here was never consulted, so it is no
  longer loaded.

  Args:
    df: DataFrame with a string column ``safe_text``. Modified in place.

  Returns:
    None.
  """
  dropped_prefixes = ('<', '#')
  # A single pass is equivalent to filtering '<' tokens and then '#' tokens separately.
  df['text_clean'] = df['safe_text'].apply(
      lambda s: " ".join(w for w in s.lower().split() if not w.startswith(dropped_prefixes)))

In [0]:
# Stack train on top of test (row-wise); columns missing from test, such as label, become NaN.
df = pd.concat(objs=[train, test], axis='index')

In [0]:
# Adds the `text_clean` column to df in place.
clean_stop_words(df)

In [0]:
# Test rows have no label after the concat, so label presence separates the two sets.
# Explicit copies make each split an independent frame; later cells assign columns
# to them, which should not happen on the result of an indexing operation.
train = df[df['label'].notna()].copy()
test = df[df['label'].isna()].copy()

In [0]:
# Shift labels from {-1, 0, 1} to {0, 1, 2}. The values are taken from the untouched
# combined frame instead of incrementing in place, so re-running this cell does not
# shift the labels a second time. Row order matches because train is the labelled
# subset of df.
train['label'] = df.loc[df['label'].notna(), 'label'].values + 1

In [0]:
# Plain NumPy arrays of cleaned text (features) and shifted labels (targets).
xtrain = train['text_clean'].to_numpy()
ytrain = train['label'].to_numpy()

xtest = test['text_clean'].to_numpy()

In [0]:
# TF-IDF over word 1- to 3-grams; terms appearing in fewer than 3 documents are dropped.
# The on/off switches are real booleans: scikit-learn releases that validate constructor
# parameter types reject the integer 1 for use_idf / smooth_idf / sublinear_tf.
tvect = TfidfVectorizer(min_df=3, max_features=None,
            strip_accents='unicode', analyzer='word', token_pattern=r'\w{1,}',
            ngram_range=(1, 3), use_idf=True, smooth_idf=True, sublinear_tf=True,
            stop_words='english')

# Raw term counts over the same n-gram range (default min_df, so no frequency cut-off).
cvect = CountVectorizer(analyzer='word', token_pattern=r'\w{1,}',
            ngram_range=(1, 3), stop_words='english')

In [0]:
# Both vocabularies are learned from the train and test text together (df holds both).
tvect.fit(df['text_clean'].tolist())
cvect.fit(df['text_clean'].tolist())

CountVectorizer(analyzer='word', binary=False, decode_error='strict',
                dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
                lowercase=True, max_df=1.0, max_features=None, min_df=1,
                ngram_range=(1, 3), preprocessor=None, stop_words='english',
                strip_accents=None, token_pattern='\\w{1,}', tokenizer=None,
                vocabulary=None)

In [0]:
# TF-IDF matrices for the train and test text; these feed the model below.
xtrain_tv = tvect.transform(xtrain)
xtest_tv = tvect.transform(xtest)

In [0]:
# Count matrices for the train and test text.
xtrain_cv = cvect.transform(xtrain)
xtest_cv = cvect.transform(xtest)

#Training

In [0]:
# Stratified, shuffled cross-validation; the shared seed fixes the fold assignment.
n_fold = 10
fold = StratifiedKFold(n_splits=n_fold, shuffle=True, random_state=seed)

In [0]:
avg = 0          # running sum of per-fold RMSE; divided by n_fold when reported
test_oofs = []   # one test-set probability matrix per fold

for i, (tr, vr) in enumerate(fold.split(xtrain_tv, ytrain)):
  # Upper-case names hold the fitting part of the fold, lower-case the held-out part.
  X, x = xtrain_tv[tr], xtrain_tv[vr]
  Y, y = ytrain[tr], ytrain[vr]

  model = LogisticRegression(C=1.1, max_iter=1000, random_state=seed, n_jobs=-1)
  model.fit(X, Y)

  # Keep the raw test probabilities for averaging later.
  test_pred = model.predict_proba(xtest_tv)
  test_oofs.append(test_pred)

  # Held-out probabilities are collapsed to signed scores before scoring.
  pred = process_prediction(model.predict_proba(x))

  # Labels were shifted by +1 for training; undo that shift for the metric.
  score = rmse(y-1, pred)
  avg += score
  print(f"Fold {i}: ", score)

print("Avg score : {:.3f}".format(avg/n_fold))

Fold 0:  0.5841870946124796
Fold 1:  0.5856070381311078
Fold 2:  0.5974970150274683
Fold 3:  0.5878650191316905
Fold 4:  0.5757172734468895
Fold 5:  0.599551556072437
Fold 6:  0.6102478625051501
Fold 7:  0.5860529534500418
Fold 8:  0.6010971207474742
Fold 9:  0.6094025142666933
Avg score : 0.594


#Submission

In [0]:
# Average the per-fold test probabilities, then collapse each row to a signed score.
final_pred = process_prediction(np.asarray(test_oofs).mean(axis=0))

In [0]:
# Build the two-column submission (ID, target) as a fresh frame instead of adding a
# column to a slice of `test`, which triggers pandas' SettingWithCopyWarning.
submission = (
  test[['tweet_id']]
  .rename(columns={'tweet_id': 'ID'})
  .assign(target=final_pred)
)

In [0]:
# Preview the submission frame.
submission.head()

Unnamed: 0,ID,target
0,00BHHHP1,0.384746
1,00UNMD0E,0.496808
2,01AXPTJF,0.0
3,01HOEQJW,0.695313
4,01JUKMAO,0.0


In [0]:
# Summary statistics of the predicted targets, as a sanity check on their range.
submission.describe()

Unnamed: 0,target
count,5177.0
mean,0.288592
std,0.354426
min,-0.7031
25%,0.0
50%,0.0
75%,0.61857
max,0.984889


In [0]:
# Write the submission with the mean CV RMSE embedded in the file name.
# NOTE(review): the name says "linear_regression" but the model trained above is
# LogisticRegression; the score is also written at full float precision.
submission.to_csv(f'linear_regression_{avg/n_fold}.csv', index=False)