In [28]:
# importing libraries
import pandas as pd
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import re
from nltk.stem import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score,f1_score
from imblearn.under_sampling import RandomUnderSampler

In [29]:
df = pd.read_csv("D:\\SensViz\\Machine Learning Task 3\\train.csv")

df.shape

(7613, 5)

In [30]:
df.describe()

Unnamed: 0,id,target
count,7613.0,7613.0
mean,5441.934848,0.42966
std,3137.11609,0.49506
min,1.0,0.0
25%,2734.0,0.0
50%,5408.0,0.0
75%,8146.0,1.0
max,10873.0,1.0


In [31]:
df.isnull().sum()

id             0
keyword       61
location    2533
text           0
target         0
dtype: int64

In [32]:
df.head()

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1


In [33]:
# to check the data is balanced or not
class_counts = df['target'].value_counts()
print(class_counts)

0    4342
1    3271
Name: target, dtype: int64


In [34]:
# selecting the target and text coloumn and drop the null rows
new_df = df[['text', 'target']]
new_df = new_df.dropna()

In [35]:
#using the under sampling to balance the data
rus = RandomUnderSampler(random_state=42)
X_resampled, Y_resampled = rus.fit_resample(new_df[['text']], new_df['target'])
new_df = pd.concat([X_resampled, Y_resampled], axis=1)

In [36]:
#preprocessing of text data and some prerquists 
nltk.download('stopwords')
nltk.download('punkt')
stop_words = set(stopwords.words('english'))
porter = PorterStemmer()

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Umair\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Umair\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [37]:
# function for preprocessing text
def preprocess_text(text):
# tokenize the text
    tokenized = word_tokenize(text.lower())
# remove stop words
    filtered = [token for token in tokenized if token not in stop_words]
# remove special characters and HTML tags
    cleaned = [re.sub(r'https?://.[\r\n]', '', token) for token in filtered]
    cleaned = [re.sub(r'@[A-Za-z0-9]+', '', token) for token in cleaned]
    cleaned = [re.sub(r'#', '', token) for token in cleaned]
    cleaned = [re.sub(r'[^\w\s]', '', token) for token in cleaned]
    cleaned = [re.sub(r'<.*?>', '', token) for token in cleaned]
# perform stemming
    stemmed = [porter.stem(token) for token in cleaned]
    return ' '.join(stemmed)

new_df['text'] = new_df['text'].apply(preprocess_text)

In [38]:
# vectorized the data and accessing the values to X and Y
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(new_df['text'])
Y = new_df['target']


In [39]:
# in the train test split
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.1, random_state=42)

In [40]:
#training the logestic regression model

logreg = LogisticRegression()
logreg.fit(X_train, Y_train)
y_pred = logreg.predict(X_test)

In [41]:
# evaluate the model
Y_pred = logreg.predict(X_test)
print(classification_report(Y_test, Y_pred))
print(confusion_matrix(Y_test, Y_pred))

print('Accuracy:', accuracy_score(Y_test, Y_pred))
print('F1 score:', f1_score(Y_test, Y_pred))

              precision    recall  f1-score   support

           0       0.78      0.83      0.80       339
           1       0.81      0.75      0.78       316

    accuracy                           0.79       655
   macro avg       0.79      0.79      0.79       655
weighted avg       0.79      0.79      0.79       655

[[282  57]
 [ 80 236]]
Accuracy: 0.7908396946564885
F1 score: 0.7750410509031199


In [42]:
#Loading the Tst data
df_test = pd.read_csv("D:\\SensViz\\Machine Learning Task 3\\test.csv")
# applying the preprocessing function on the test data
df_test_text = df_test['text'].apply(preprocess_text)

In [43]:
#vectorized the data
df_test_text = vectorizer.transform(df_test_text)

In [44]:
#pridict the labels
y_pred_LR = logreg.predict(df_test_text)
print(y_pred_LR)

[1 1 1 ... 1 1 0]


In [45]:
# create the submission file 
submission = pd.DataFrame({'id': df_test['id'], 'target': y_pred_LR})
submission.to_csv('submission.csv', index=False)

In [46]:
# now calculated the F1
f1 = f1_score(Y_test, Y_pred, average='weighted')
print('F1 score:', f1)

F1 score: 0.7903213368728546
