In [50]:
import pandas as pd
import re

In [52]:
train=pd.read_csv("train.csv")
train.shape

(31962, 3)

In [54]:
train.head

<bound method NDFrame.head of           id  label                                              tweet
0          1      0   @user when a father is dysfunctional and is s...
1          2      0  @user @user thanks for #lyft credit i can't us...
2          3      0                                bihday your majesty
3          4      0  #model   i love u take with u all the time in ...
4          5      0             factsguide: society now    #motivation
...      ...    ...                                                ...
31957  31958      0  ate @user isz that youuu?ðððððð...
31958  31959      0    to see nina turner on the airwaves trying to...
31959  31960      0  listening to sad songs on a monday morning otw...
31960  31961      1  @user #sikh #temple vandalised in in #calgary,...
31961  31962      0                   thank you @user for you follow  

[31962 rows x 3 columns]>

In [56]:
train.info

<bound method DataFrame.info of           id  label                                              tweet
0          1      0   @user when a father is dysfunctional and is s...
1          2      0  @user @user thanks for #lyft credit i can't us...
2          3      0                                bihday your majesty
3          4      0  #model   i love u take with u all the time in ...
4          5      0             factsguide: society now    #motivation
...      ...    ...                                                ...
31957  31958      0  ate @user isz that youuu?ðððððð...
31958  31959      0    to see nina turner on the airwaves trying to...
31959  31960      0  listening to sad songs on a monday morning otw...
31960  31961      1  @user #sikh #temple vandalised in in #calgary,...
31961  31962      0                   thank you @user for you follow  

[31962 rows x 3 columns]>

In [58]:
test=pd.read_csv("test.csv")
test.shape


(17197, 2)

In [60]:
test.head

<bound method NDFrame.head of           id                                              tweet
0      31963  #studiolife #aislife #requires #passion #dedic...
1      31964   @user #white #supremacists want everyone to s...
2      31965  safe ways to heal your #acne!!    #altwaystohe...
3      31966  is the hp and the cursed child book up for res...
4      31967    3rd #bihday to my amazing, hilarious #nephew...
...      ...                                                ...
17192  49155  thought factory: left-right polarisation! #tru...
17193  49156  feeling like a mermaid ð #hairflip #neverre...
17194  49157  #hillary #campaigned today in #ohio((omg)) &am...
17195  49158  happy, at work conference: right mindset leads...
17196  49159  my   song "so glad" free download!  #shoegaze ...

[17197 rows x 2 columns]>

In [62]:
train.info

<bound method DataFrame.info of           id  label                                              tweet
0          1      0   @user when a father is dysfunctional and is s...
1          2      0  @user @user thanks for #lyft credit i can't us...
2          3      0                                bihday your majesty
3          4      0  #model   i love u take with u all the time in ...
4          5      0             factsguide: society now    #motivation
...      ...    ...                                                ...
31957  31958      0  ate @user isz that youuu?ðððððð...
31958  31959      0    to see nina turner on the airwaves trying to...
31959  31960      0  listening to sad songs on a monday morning otw...
31960  31961      1  @user #sikh #temple vandalised in in #calgary,...
31961  31962      0                   thank you @user for you follow  

[31962 rows x 3 columns]>

# Defining a function for text cleaning

In [65]:
#cleaning and processing text data
def cleaning_text(df, text_field):
    df[text_field]= df[text_field].str.lower()
    #It removes Twitter handles, non-alphanumeric characters, URLs, the "rt" string at the beginning of a tweet, and any remaining URLs from the text data
    df[text_field] = df[text_field].apply(lambda elem: re.sub(r"(@[A-Za-z0-9]+)|([^0-9A-Za-z \t])|(\w+:\/\/\S+)|^rt|http.+?", "", elem))  
    return df


In [67]:
test_clean = cleaning_text(test, "tweet")
train_clean = cleaning_text(train, "tweet")

In [68]:
test_clean

Unnamed: 0,id,tweet
0,31963,studiolife aislife requires passion dedication...
1,31964,white supremacists want everyone to see the ...
2,31965,safe ways to heal your acne altwaystoheal h...
3,31966,is the hp and the cursed child book up for res...
4,31967,3rd bihday to my amazing hilarious nephew el...
...,...,...
17192,49155,thought factory leftright polarisation trump u...
17193,49156,feeling like a mermaid hairflip neverready fo...
17194,49157,hillary campaigned today in ohioomg amp used w...
17195,49158,happy at work conference right mindset leads t...


In [69]:
train_clean

Unnamed: 0,id,label,tweet
0,1,0,when a father is dysfunctional and is so sel...
1,2,0,thanks for lyft credit i cant use cause they...
2,3,0,bihday your majesty
3,4,0,model i love u take with u all the time in u...
4,5,0,factsguide society now motivation
...,...,...,...
31957,31958,0,ate isz that youuu
31958,31959,0,to see nina turner on the airwaves trying to...
31959,31960,0,listening to sad songs on a monday morning otw...
31960,31961,1,sikh temple vandalised in in calgary wso cond...


In [70]:
train_clean.groupby(["label"]).count()["id"]
#shows that data is imbalanced

label
0    29720
1     2242
Name: id, dtype: int64

In [75]:
from sklearn.utils import resample
train_major = train_clean[train_clean.label==0]
train_minor = train_clean[train_clean.label==1]

In [77]:
#upsampling data

train_minor_upsampled = resample(train_minor,replace=True,n_samples=len(train_major),random_state=123)
train_upsampled = pd.concat([train_minor_upsampled,train_major])


In [79]:
#upsampled data
train_upsampled['label'].value_counts()

label
1    29720
0    29720
Name: count, dtype: int64

In [81]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer  #converts a collection of text documents to a matrix of token counts.
from sklearn.feature_extraction.text import TfidfTransformer 
from sklearn.linear_model import SGDClassifier

In [83]:
pipeline_SGD=Pipeline([('vect',CountVectorizer()),('tfidf',TfidfTransformer()),('nb',SGDClassifier()),])

In [85]:
from sklearn.model_selection import train_test_split  #utility function for splitting arrays or matrices into random train and test subsets.
X_train, X_test, Y_train, Y_test = train_test_split(train_upsampled['tweet'],train_upsampled['label'],random_state=0)

In [87]:
#training the model
model = pipeline_SGD.fit(X_train,Y_train)
Y_predict = model.predict(X_test)

In [88]:
#calculating f1_score
from sklearn.metrics import f1_score
f1_score(Y_test, Y_predict)

0.9695353643090461

In [89]:

from nltk.stem import SnowballStemmer

# Create a Snowball Stemmer for a specific language
stemmer = SnowballStemmer("english")

# Example usage
word = "carefully"
stemmed_word = stemmer.stem(word)
print(stemmed_word)


care


In [90]:
def clean_text(text):
    text = str(text).lower()
    text = re.sub(r"(@[A-Za-z0-9]+)|([^0-9A-Za-z\t])|(\w+:\/\/\S+)|^rt|http.+?","",text)
    return text

In [91]:
def predict_custom_input(model,text):
    cleaned_text = clean_text(text)
    prediction = model.predict([cleaned_text])
    return prediction [0]

In [97]:
custom_text = "I love sunny days"
prediction = predict_custom_input(model,custom_text)
print(f"Prediction for custom input '{custom_text}':{prediction}")

Prediction for custom input 'I love sunny days':0
