In [1]:
#Data
import pandas as pd
import numpy as np

#NLP
import re
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize

#Preprocessing
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

#Modeling
from sklearn.svm import SVC
from sklearn.metrics import f1_score

In [2]:
# Single place to change where the competition files live.
# NOTE: this is still an absolute, machine-specific folder; adjust it per environment.
DATA_DIR="C:\\Users\\admin\\Desktop\\DS Docs\\nlp-getting-started"

# Labelled tweets, unlabelled tweets to predict, and the sample submission file.
train=pd.read_csv(DATA_DIR+"\\train.csv")
test=pd.read_csv(DATA_DIR+"\\test.csv")
subm=pd.read_csv(DATA_DIR+"\\sample_submission.csv")

In [3]:
# Quick size check of the training frame as (rows, columns).
train.shape

(7613, 5)

In [4]:
# Preview the test frame.
test

Unnamed: 0,id,keyword,location,text
0,0,,,Just happened a terrible car crash
1,2,,,"Heard about #earthquake is different cities, s..."
2,3,,,"there is a forest fire at spot pond, geese are..."
3,9,,,Apocalypse lighting. #Spokane #wildfires
4,11,,,Typhoon Soudelor kills 28 in China and Taiwan
...,...,...,...,...
3258,10861,,,EARTHQUAKE SAFETY LOS ANGELES ÛÒ SAFETY FASTE...
3259,10865,,,Storm in RI worse than last hurricane. My city...
3260,10868,,,Green Line derailment in Chicago http://t.co/U...
3261,10874,,,MEG issues Hazardous Weather Outlook (HWO) htt...


In [5]:
# Set the labels and the test ids aside before the two frames are merged.
target=train['target']
test_id=test['id']

# Training features only (label column removed).
train1=train.drop(columns='target')

# Stack train rows first, then test rows, under a fresh 0..n-1 index.
data=pd.concat([train1,test],ignore_index=True)

In [6]:
# Preview the combined train+test frame.
data

Unnamed: 0,id,keyword,location,text
0,1,,,Our Deeds are the Reason of this #earthquake M...
1,4,,,Forest fire near La Ronge Sask. Canada
2,5,,,All residents asked to 'shelter in place' are ...
3,6,,,"13,000 people receive #wildfires evacuation or..."
4,7,,,Just got sent this photo from Ruby #Alaska as ...
...,...,...,...,...
10871,10861,,,EARTHQUAKE SAFETY LOS ANGELES ÛÒ SAFETY FASTE...
10872,10865,,,Storm in RI worse than last hurricane. My city...
10873,10868,,,Green Line derailment in Chicago http://t.co/U...
10874,10874,,,MEG issues Hazardous Weather Outlook (HWO) htt...


In [7]:
def processEmail(contents):
    """Normalize a piece of text and return its list of stemmed tokens.

    The text is lower-cased, then a fixed sequence of regex substitutions is
    applied (tag-like ``<...>`` spans become a space, digit runs become
    ``number``, http/https URLs become ``httpaddr``, ``x@y`` patterns become
    ``emailaddr`` and runs of ``$`` become ``dollar``). The result is split
    with ``word_tokenize``; every token is stripped of non-alphanumeric
    characters and Porter-stemmed, and tokens that end up empty are dropped.

    Parameters
    ----------
    contents : str
        Raw text to process.

    Returns
    -------
    list of str
        Stemmed tokens in their original order.
    """
    stemmer=PorterStemmer()

    # Applied strictly in this order, exactly one pass each.
    substitutions=(
        (r'<[^<>]+>', ' '),
        (r'[0-9]+', 'number'),
        (r'(http|https)://[^\s]*', 'httpaddr'),
        (r'[^\s]+@[^\s]+', 'emailaddr'),
        (r'[$]+', 'dollar'),
    )

    text=contents.lower()
    for pattern,replacement in substitutions:
        text=re.sub(pattern,replacement,text)

    tokens=[]
    for raw_token in word_tokenize(text):
        # Strip punctuation first, then stem; drop tokens that became empty.
        stemmed=stemmer.stem(re.sub(r'[^a-zA-Z0-9]', '',raw_token))
        if len(stemmed)>=1:
            tokens.append(stemmed)

    return tokens

In [8]:
# Sample multi-paragraph text (contains dollar amounts, digits and punctuation) used to try out processEmail.
k="""We’re excited to announce that we’re co-sponsoring $500k in awards for outstanding papers and reviewers in the Reproducibility Challenge 2022.

Reproducibility is a major challenge in ML research. Reproducible results enable the scientific community to quickly verify empirical findings and incorporate new ideas. For the last 5 years, the ML Reproducibility Challenge has galvanized members of the ML community to work with authors to reproduce papers published at top conferences and to share their reports with the world.

This year, Kaggle will be awarding ~50 prizes in the form of GCP compute credits totaling $500k, including $25k each to the top 5 outstanding papers, subject to eligibility criteria.
"""

In [9]:
# Smoke-test the preprocessing on the sample text k.
processEmail(k)

['we',
 're',
 'excit',
 'to',
 'announc',
 'that',
 'we',
 're',
 'cosponsor',
 'dollarnumberk',
 'in',
 'award',
 'for',
 'outstand',
 'paper',
 'and',
 'review',
 'in',
 'the',
 'reproduc',
 'challeng',
 'number',
 'reproduc',
 'is',
 'a',
 'major',
 'challeng',
 'in',
 'ml',
 'research',
 'reproduc',
 'result',
 'enabl',
 'the',
 'scientif',
 'commun',
 'to',
 'quickli',
 'verifi',
 'empir',
 'find',
 'and',
 'incorpor',
 'new',
 'idea',
 'for',
 'the',
 'last',
 'number',
 'year',
 'the',
 'ml',
 'reproduc',
 'challeng',
 'ha',
 'galvan',
 'member',
 'of',
 'the',
 'ml',
 'commun',
 'to',
 'work',
 'with',
 'author',
 'to',
 'reproduc',
 'paper',
 'publish',
 'at',
 'top',
 'confer',
 'and',
 'to',
 'share',
 'their',
 'report',
 'with',
 'the',
 'world',
 'thi',
 'year',
 'kaggl',
 'will',
 'be',
 'award',
 'number',
 'prize',
 'in',
 'the',
 'form',
 'of',
 'gcp',
 'comput',
 'credit',
 'total',
 'dollarnumberk',
 'includ',
 'dollarnumberk',
 'each',
 'to',
 'the',
 'top',
 'num

In [10]:
def getVocabulary(emails,vocab_length):
    """Build an index -> word vocabulary of the most frequent tokens.

    Each raw string in ``emails`` is tokenized with ``processEmail`` and the
    token list is written back to ``emails[i]``, so the caller's container is
    modified in place. Entries that are not strings are taken to be token
    lists left by an earlier call and are counted as they are, which makes the
    function safe to run again on the same container.

    Parameters
    ----------
    emails : indexable, mutable sequence
        Texts (or already-tokenized word lists), addressed as ``emails[0]`` to
        ``emails[len(emails)-1]``.
    vocab_length : int
        Maximum number of words to keep.

    Returns
    -------
    dict
        ``{rank: word}`` where rank 0 is the most frequent word. Words with
        equal counts keep the order in which they were first seen.
    """
    counts=dict()

    for i in range(len(emails)):
        entry=emails[i]
        # Only raw text still needs processing; re-tokenizing a token list would fail.
        if isinstance(entry,str):
            entry=processEmail(entry)
            emails[i]=entry
        for word in entry:
            counts[word]=counts.get(word,0)+1

    # sorted() is stable (also with reverse=True), so ties stay in first-seen order.
    ranked=sorted(counts.items(),key=lambda item: item[1], reverse=True)

    return {index: word for index,(word,_) in enumerate(ranked[0:vocab_length])}

In [11]:
# Build the vocabulary from every text (train and test rows together), keeping the 10000 most frequent tokens.
# Side effect: getVocabulary writes each token list back into the Series it receives,
# which is the assignment pandas reports in the warning shown below.
vocabulary=getVocabulary(data['text'],10000)
vocabulary


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  emails[i]=processEmail(emails[i])


{0: 'httpaddr',
 1: 'the',
 2: 'a',
 3: 'to',
 4: 'in',
 5: 'number',
 6: 'of',
 7: 'i',
 8: 'and',
 9: 'is',
 10: 'you',
 11: 'for',
 12: 'it',
 13: 'on',
 14: 's',
 15: 'my',
 16: 'that',
 17: 'with',
 18: 'by',
 19: 'at',
 20: 'be',
 21: 'thi',
 22: 'nt',
 23: 'are',
 24: 'from',
 25: 'like',
 26: 'wa',
 27: 'have',
 28: 'fire',
 29: 'do',
 30: 'amp',
 31: 'as',
 32: 'your',
 33: 'up',
 34: 'just',
 35: 'get',
 36: 'not',
 37: 'me',
 38: 'we',
 39: 'but',
 40: 'so',
 41: 'out',
 42: 'no',
 43: 'm',
 44: 'will',
 45: 'all',
 46: 'ha',
 47: 'after',
 48: 'when',
 49: 'an',
 50: 'he',
 51: 'if',
 52: 'new',
 53: 'more',
 54: 'bomb',
 55: 'via',
 56: 'they',
 57: 'what',
 58: 'about',
 59: 'now',
 60: 'one',
 61: 'how',
 62: 'or',
 63: 'go',
 64: 'news',
 65: 'peopl',
 66: 'there',
 67: 'who',
 68: 'over',
 69: 'burn',
 70: 'can',
 71: 'kill',
 72: 'been',
 73: 'attack',
 74: 'us',
 75: 'into',
 76: 'video',
 77: 'emerg',
 78: 'flood',
 79: 'crash',
 80: 'time',
 81: 're',
 82: 'would',

In [12]:
def getKey(dictionary,val):
    """Reverse lookup: return the first key whose value equals ``val``.

    Keys are examined in the dictionary's iteration order. ``None`` is
    returned when no value matches.
    """
    matches=(key for key,value in dictionary.items() if value==val)
    return next(matches,None)

In [13]:
def getIndices(email,vocabulary):
    """Return the set of vocabulary indices for the words present in ``email``.

    Parameters
    ----------
    email : iterable of str
        Tokens of one text.
    vocabulary : dict
        ``{index: word}`` mapping; words must be hashable.

    Returns
    -------
    set
        Indices of the tokens found in the vocabulary. Unknown tokens are
        ignored and repeated tokens contribute a single index.
    """
    # Invert the mapping once so each token costs one dict lookup instead of
    # two scans over the whole vocabulary. setdefault keeps the first index
    # if a word is listed more than once.
    index_of=dict()
    for index,word in vocabulary.items():
        index_of.setdefault(word,index)

    return {index_of[word] for word in email if word in index_of}

In [14]:
def getFeatureVector(word_indices,vocab_length):
    """Turn a collection of integer indices into a binary indicator vector.

    Returns a float NumPy array of length ``vocab_length`` holding 1 at every
    position listed in ``word_indices`` and 0 everywhere else.
    """
    indicator=np.zeros(vocab_length)

    positions=list(word_indices)
    if positions:
        # One fancy-index assignment switches on every listed position.
        indicator[positions]=1

    return indicator

In [15]:
# Width of the feature vectors; same value as the 10000 passed to getVocabulary.
vocab_length=10000

In [16]:
# getIndices expects each text as a list of words. getVocabulary normally leaves
# token lists in data['text'] as a side effect; tokenize here any entry that is
# still a raw string so this cell does not depend on that in-place write.
# NOTE(review): pandas already warns about that write; under Copy-on-Write it may not reach `data` at all - confirm.
emails=[processEmail(text) if isinstance(text,str) else text for text in data['text']]

In [17]:
# Fill a preallocated int16 matrix one row at a time instead of first collecting
# a float64 vector per text and casting the whole array afterwards; the values
# are the same 0/1 indicators, but the large float64 intermediates are never built.
feature_matrix=np.zeros((len(emails),vocab_length),dtype=np.int16)
for row,email in enumerate(emails):
    # Assigning the float 0/1 vector into an int16 row casts it on the fly.
    feature_matrix[row]=getFeatureVector(getIndices(email,vocabulary),vocab_length)
X=pd.DataFrame(feature_matrix)

In [18]:
# Preview the feature matrix: one row per text, one 0/1 column per vocabulary index.
X

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,9990,9991,9992,9993,9994,9995,9996,9997,9998,9999
0,0,1,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,1,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,1,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10871,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
10872,0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
10873,1,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
10874,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [19]:
# X follows the row order of `data` (train rows first, then test rows), so split
# it by position. Counting rows does not rely on train1 having a 0..n-1 index.
n_train=len(train1)
train_final=X.iloc[:n_train,:]
test_final=X.iloc[n_train:,:].reset_index(drop=True)

# Each part must line up with the frame it came from.
assert len(train_final)==len(train1)
assert len(test_final)==len(test)

In [20]:
# Labels for the training rows (the target column set aside earlier).
y=target

In [21]:
# Keep 80% of the labelled rows for fitting and hold out the rest for validation;
# random_state=1 makes the split repeatable.
X_train,X_test,y_train,y_test=train_test_split(train_final,y,train_size=0.8,random_state=1)


In [22]:
# Support-vector classifier with default hyperparameters, fitted on the training split.
svc=SVC()
svc.fit(X_train,y_train)

In [23]:
# Evaluate on the held-out split.
# NOTE(review): for scikit-learn classifiers score() reports mean accuracy; f1_score is imported
# at the top but not used in the cells shown - confirm which metric is intended.
svc.score(X_test,y_test)

0.8063033486539725

In [24]:
# Predict labels for the test rows of the feature matrix.
y_pred=svc.predict(test_final)

In [25]:
# Ids of the test rows, saved before train and test were concatenated.
test_id

0           0
1           2
2           3
3           9
4          11
        ...  
3258    10861
3259    10865
3260    10868
3261    10874
3262    10875
Name: id, Length: 3263, dtype: int64

In [26]:
# Peek at the predicted labels.
y_pred

array([1, 0, 1, ..., 1, 1, 0], dtype=int64)

In [27]:
# Sample submission loaded at the start; shows the id/target layout of a submission file.
subm

Unnamed: 0,id,target
0,0,0
1,2,0
2,3,0
3,9,0
4,11,0
...,...,...
3258,10861,0
3259,10865,0
3260,10868,0
3261,10874,0


In [28]:
# Pair ids and predictions by position. Passing plain arrays avoids the
# index-label alignment of pd.concat(axis=1), which would insert NaNs if
# test_id ever carried a non-default index.
submission=pd.DataFrame({'id':test_id.values,'target':y_pred})
submission

Unnamed: 0,id,target
0,0,1
1,2,0
2,3,1
3,9,0
4,11,1
...,...,...
3258,10861,0
3259,10865,1
3260,10868,1
3261,10874,1


In [29]:
# Write the submission as CSV with a header row and without the DataFrame index.
# NOTE: the destination is an absolute, machine-specific path; adjust it per environment.
submission.to_csv("C:\\Users\\admin\\Desktop\\GITHUB Projects\\KK\\Disaster Tweets\\Disaster_Submission.csv",index=False,header=True)

In [30]:
# Final size check of the submission frame as (rows, columns).
submission.shape

(3263, 2)