In [1]:
import os
from pathlib import Path

import numpy as np
import pandas as pd

In [2]:
# Directory holding train.csv / test.csv. The default is the original local
# path; set the TWITTER_DATA_DIR environment variable to run the notebook on
# another machine without editing this cell.
DATA_DIR = Path(os.environ.get(
    'TWITTER_DATA_DIR',
    'C:/Users/user/Downloads/Datasets/twitter analysis',
))

train_data = pd.read_csv(DATA_DIR / 'train.csv')
test_data = pd.read_csv(DATA_DIR / 'test.csv')

In [3]:
# Preview the first rows to confirm the columns parsed as expected.
train_data.head()

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1


In [4]:
# (rows, columns) of the training frame.
train_data.shape

(7613, 5)

In [5]:
# Missing-value count per column, used to decide what needs imputing.
train_data.isna().sum()

id             0
keyword       61
location    2533
text           0
target         0
dtype: int64

In [6]:
# Replace missing keyword/location values with an empty string so the columns
# can be concatenated with the tweet text later.
# Assign the result back instead of calling fillna(inplace=True) on the
# selected column: that form is chained assignment, which pandas warns about
# and which does not modify train_data when Copy-on-Write is enabled.
train_data['keyword'] = train_data['keyword'].fillna('')
train_data['location'] = train_data['location'].fillna('')

In [7]:
# Check non-null counts and dtypes after filling keyword/location.
train_data.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7613 entries, 0 to 7612
Data columns (total 5 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   id        7613 non-null   int64 
 1   keyword   7613 non-null   object
 2   location  7613 non-null   object
 3   text      7613 non-null   object
 4   target    7613 non-null   int64 
dtypes: int64(2), object(3)
memory usage: 297.5+ KB


In [8]:
# Re-count missing values; every column should now report zero.
train_data.isna().sum()

id          0
keyword     0
location    0
text        0
target      0
dtype: int64

In [9]:
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer,WordNetLemmatizer
from nltk.tokenize import word_tokenize
import re

In [10]:
# word_tokenize (called in preprocess_text) needs the Punkt tokenizer data,
# which this notebook never fetched, so a fresh environment would fail there.
# Newer NLTK releases look the data up under 'punkt_tab'; requesting both keeps
# the notebook runnable across versions (an unknown id is only reported, not
# raised).
nltk.download('punkt')
nltk.download('punkt_tab')
# Stop-word lists used to filter tokens; kept last so the cell still displays
# this call's return value.
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\user\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [11]:
# WordNet corpus backing the WordNetLemmatizer used in preprocess_text.
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\user\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [12]:
# NOTE(review): X and y are rebound in the train/validation split cell
# (X becomes the combined 'tweet' column) before either is used, so these
# assignments currently have no effect on the models below.
X = train_data.text
y = train_data.target

In [13]:
sw = stopwords.words('english')
# Frozen-set view of the stop-word list: membership is tested once per token,
# and a set lookup avoids scanning the whole list each time. `sw` itself is
# kept as a list for any other cell that uses it.
_stopword_set = frozenset(sw)
lemmatizer = WordNetLemmatizer()


def preprocess_text(text):
    """Normalise one tweet into a space-separated string of lemmatized tokens.

    Steps, in order:
      1. lowercase the text;
      2. replace every character that is not an ASCII letter with a space
         (this strips digits, punctuation, '#', '@' and URL symbols);
      3. tokenize with NLTK's ``word_tokenize``;
      4. drop English stop words;
      5. lemmatize each remaining token with ``WordNetLemmatizer``.

    Parameters
    ----------
    text : str
        Raw tweet text.

    Returns
    -------
    str
        The cleaned tokens joined by single spaces (empty string if no token
        survives).
    """
    text = text.lower()

    # Digits are removed here together with all other non-letters, so no
    # separate numeric-token filter is needed afterwards.
    text = re.sub('[^a-zA-Z]', ' ', text)

    tokens = word_tokenize(text)

    # Stop-word filtering happens before lemmatization, as in the original.
    return ' '.join(
        lemmatizer.lemmatize(token)
        for token in tokens
        if token not in _stopword_set
    )

In [14]:
# Clean the tweet text of both splits with the same preprocessing function so
# train and test share one vocabulary.
for frame in (train_data, test_data):
    frame['text'] = frame['text'].apply(preprocess_text)


In [15]:
from sklearn.model_selection import train_test_split
# Model input: cleaned text followed by keyword and location, space-separated.
train_data['tweet'] = (
    train_data['text']
    + ' ' + train_data['keyword']
    + ' ' + train_data['location']
)

X, y = train_data['tweet'], train_data['target']

# Hold out 20% of the rows for validation; the fixed seed keeps the split
# reproducible.
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.2, random_state=42
)
X_train.head()


4996    courageous honest analysis need use atomic bom...
3263    zachzaidman thescore wld b shame golf cart bec...
4907    tell barackobama rescind medal honor given u s...
2855    worried ca drought might affect extreme weathe...
4716    youngheroesid lava blast amp power red panther...
Name: tweet, dtype: object

In [16]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
import xgboost as xgb


In [17]:
# Baseline: TF-IDF features (vocabulary capped at 5000 terms) feeding a
# logistic regression with default settings.
model = Pipeline(steps=[
    ('tfidf', TfidfVectorizer(max_features=5000)),
    ('classifier', LogisticRegression()),
])

# fit() returns the fitted pipeline, so training and validation prediction
# can be chained.
val_pred = model.fit(X_train, y_train).predict(X_val)


In [18]:
from sklearn.metrics import confusion_matrix,accuracy_score
# Validation accuracy of the logistic-regression pipeline.
acc = accuracy_score(y_true=y_val, y_pred=val_pred)
acc

0.7852921864740644

In [19]:
# Confusion matrix for the logistic-regression pipeline
# (rows: true class, columns: predicted class).
cm = confusion_matrix(y_true=y_val, y_pred=val_pred)
cm

array([[743, 131],
       [196, 453]], dtype=int64)

In [20]:
# Same TF-IDF front end, now with a 100-tree random forest; the seed fixes the
# forest's randomness.
model = Pipeline(steps=[
    ('tfidf', TfidfVectorizer(max_features=5000)),
    ('classifier', RandomForestClassifier(n_estimators=100, random_state=42)),
])

val_pred = model.fit(X_train, y_train).predict(X_val)


In [21]:
# Validation accuracy of the random-forest pipeline.
acc = accuracy_score(y_true=y_val, y_pred=val_pred)
acc

0.7787261982928431

In [22]:
# Confusion matrix for the random-forest pipeline
# (rows: true class, columns: predicted class).
cm = confusion_matrix(y_true=y_val, y_pred=val_pred)
cm

array([[778,  96],
       [241, 408]], dtype=int64)

In [23]:
# TF-IDF front end with a linear-kernel SVM.
# probability=True makes SVC run an internal, randomly shuffled
# cross-validation to calibrate probabilities; random_state seeds that shuffle
# so any later predict_proba output is reproducible, consistent with the seeded
# split and random forest elsewhere in this notebook.
model = Pipeline([
    ('tfidf', TfidfVectorizer(max_features=5000)),
    ('classifier', SVC(kernel='linear', C=1.0, probability=True, random_state=42))
])

model.fit(X_train, y_train)
val_pred = model.predict(X_val)


In [24]:
# Validation accuracy of the linear-SVM pipeline.
acc = accuracy_score(y_true=y_val, y_pred=val_pred)
acc

0.788575180564675

In [25]:
# Confusion matrix for the linear-SVM pipeline
# (rows: true class, columns: predicted class).
cm = confusion_matrix(y_true=y_val, y_pred=val_pred)
cm

array([[740, 134],
       [188, 461]], dtype=int64)

In [26]:
# TF-IDF front end with gradient-boosted trees (100 rounds, depth 6).
model = Pipeline(steps=[
    ('tfidf', TfidfVectorizer(max_features=5000)),
    ('classifier', xgb.XGBClassifier(max_depth=6, n_estimators=100)),
])

val_pred = model.fit(X_train, y_train).predict(X_val)


In [27]:
# Validation accuracy of the XGBoost pipeline.
acc = accuracy_score(y_true=y_val, y_pred=val_pred)
acc

0.7859487852921865

In [28]:
# Confusion matrix for the XGBoost pipeline
# (rows: true class, columns: predicted class).
cm = confusion_matrix(y_true=y_val, y_pred=val_pred)
cm

array([[779,  95],
       [231, 418]], dtype=int64)