### Business Objective
The primary goal of this assignment is to develop a machine learning model that can accurately classify disaster-related tweets as either real (`target = 1`) or fake (`target = 0`).

In [2]:
import pandas as pd
import numpy as np
# Load the labelled tweets; the relative path is resolved against the
# current working directory.
df = pd.read_csv("Disaster_tweets_NB.csv")

# Preview the first rows to confirm the columns were parsed.
df.head()

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1


### Data Preprocessing

In [4]:
# Dimensions of the loaded frame as (rows, columns).
df.shape

(7613, 5)

In [5]:
# Count missing values per column to see which columns are usable.
df.isnull().sum()

id             0
keyword       61
location    2533
text           0
target         0
dtype: int64

**The `keyword` and `location` columns must be removed because they contain null values and add nothing meaningful for model building.**

**We need only the `text` and `target` columns.**

In [11]:
# Keep only the tweet text (feature) and the label column.
df = df[["text", "target"]]

# Preview the reduced frame.
df.head()

Unnamed: 0,text,target
0,Our Deeds are the Reason of this #earthquake M...,1
1,Forest fire near La Ronge Sask. Canada,1
2,All residents asked to 'shelter in place' are ...,1
3,"13,000 people receive #wildfires evacuation or...",1
4,Just got sent this photo from Ruby #Alaska as ...,1


In [13]:
# Confirm that only the two selected columns remain.
df.shape

(7613, 2)

In [15]:
# Re-check missing values per column after the column selection.
df.isnull().sum()

text      0
target    0
dtype: int64

In [17]:
# Separate the raw tweet text (features) from the labels.
X, y = df["text"], df["target"]

In [19]:
# Number of tweets in the feature Series.
X.shape

(7613,)

In [21]:
# Number of labels; should equal the length of X.
y.shape

(7613,)

In [23]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the tweets for evaluation; the fixed seed keeps the
# split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [24]:
# Number of tweets in the training split.
X_train.shape

(6090,)

In [25]:
# Number of tweets in the held-out test split.
X_test.shape

(1523,)

In [26]:
# Number of training labels; should equal the length of X_train.
y_train.shape

(6090,)

In [27]:
# Number of test labels; should equal the length of X_test.
y_test.shape

(1523,)

In [35]:
from sklearn.feature_extraction.text import CountVectorizer

# Bag-of-words features with scikit-learn's built-in English stop-word list.
vectorizer = CountVectorizer(stop_words='english')
# Fit the vectorizer on the training tweets only, so the test split does
# not influence the fitted vectorizer.
X_train_vectorized = vectorizer.fit_transform(X_train)
# Encode the test tweets with the vectorizer fitted on the training tweets.
X_test_vectorized = vectorizer.transform(X_test)

In [37]:
from sklearn.naive_bayes import MultinomialNB
# Train a multinomial Naive Bayes classifier on the training count matrix.
# `fit` returns the fitted estimator, so construction and fitting are chained.
model = MultinomialNB().fit(X_train_vectorized, y_train)

# Predict labels for the held-out tweets.
y_pred = model.predict(X_test_vectorized)

In [39]:
# Peek at the predicted labels for the test split.
y_pred

array([0, 0, 0, ..., 1, 1, 0], dtype=int64)

In [41]:
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
# Fraction of test tweets whose predicted label matches the true label.
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print("Accuracy:{:.2f}".format(accuracy))

Accuracy:0.79


In [43]:
# Per-class precision, recall and F1 on the test split.
print(classification_report(y_test, y_pred))

# Confusion matrix for the same predictions, printed under its heading.
conf_matrix = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:", conf_matrix, sep="\n")

              precision    recall  f1-score   support

           0       0.80      0.84      0.82       874
           1       0.77      0.71      0.74       649

    accuracy                           0.79      1523
   macro avg       0.78      0.78      0.78      1523
weighted avg       0.79      0.79      0.79      1523

Confusion Matrix:
[[735 139]
 [185 464]]


In [45]:
def predict_tweet(tweet):
    """Classify a single tweet with the fitted vectorizer and model.

    Relies on the module-level ``vectorizer`` and ``model`` objects, which
    must already be fitted when this function is called.

    Parameters
    ----------
    tweet : str
        Raw tweet text; it is wrapped in a one-element list before being
        passed to ``vectorizer.transform``.

    Returns
    -------
    str
        ``"Real Tweet"`` when the model predicts class 1, otherwise
        ``"Fake Tweet"``.
    """
    features = vectorizer.transform([tweet])
    # Only one tweet was passed in, so the first prediction is the answer.
    label = model.predict(features)[0]
    if label == 1:
        return "Real Tweet"
    return "Fake Tweet"

# Example: classify one previously unseen tweet and print the returned label.
new_tweet = "Breaking news: A major earthquake has struck the city."
print(predict_tweet(new_tweet))

Real Tweet
