In [34]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt


In [35]:
# Loading submission data
submission_data=pd.read_csv("sample_submission.csv")
submission_data=submission_data.drop(['id'],axis=1)
submission_data.head()

Unnamed: 0,target
0,0
1,0
2,0
3,0
4,0


In [36]:
# Loading test data
test_data=pd.read_csv("test.csv")
test_data.head()

Unnamed: 0,id,keyword,location,text
0,0,,,Just happened a terrible car crash
1,2,,,"Heard about #earthquake is different cities, s..."
2,3,,,"there is a forest fire at spot pond, geese are..."
3,9,,,Apocalypse lighting. #Spokane #wildfires
4,11,,,Typhoon Soudelor kills 28 in China and Taiwan


In [37]:
# Loading data
data=pd.read_csv("train.csv")
data.head()

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1


In [38]:
# Clearing id column
data=data.drop(['id','location'],axis=1)
data.head()

Unnamed: 0,keyword,text,target
0,,Our Deeds are the Reason of this #earthquake M...,1
1,,Forest fire near La Ronge Sask. Canada,1
2,,All residents asked to 'shelter in place' are ...,1
3,,"13,000 people receive #wildfires evacuation or...",1
4,,Just got sent this photo from Ruby #Alaska as ...,1


In [39]:
data.shape

(7613, 3)

In [40]:
# checking for null values
data.isnull().sum().sort_values(ascending=False)


Unnamed: 0,0
keyword,61
text,0
target,0


In [41]:
# Imputting the missing values
data['keyword']=data['keyword'].fillna("missing")


In [57]:
# Encode the columns
from sklearn.preprocessing import OneHotEncoder
ohe=OneHotEncoder(handle_unknown="ignore", sparse_output=True)
keyword_encoded=ohe.fit_transform(data[['keyword']])  # It will return a sparse matrix

In [58]:
data.isnull().sum()

Unnamed: 0,0
keyword,0
text,0
target,0


In [59]:
data['keyword'].value_counts()

Unnamed: 0_level_0,count
keyword,Unnamed: 1_level_1
missing,61
fatalities,45
deluge,42
armageddon,42
damage,41
...,...
forest%20fire,19
epicentre,12
threat,11
inundation,10


In [60]:
data['target'].value_counts()

Unnamed: 0_level_0,count
target,Unnamed: 1_level_1
0,4342
1,3271


***Preprocessing Text data***

In [61]:

test_data=test_data.drop(['id','location'],axis=1)
test_data.head()

KeyError: "['id', 'location'] not found in axis"

In [62]:
# Imputting the missing values
test_data['keyword']=test_data['keyword'].fillna("missing")


In [63]:
# Encode the columns

test_keyword_encoded=ohe.transform(test_data[['keyword']])  # It will return a sparse matrix

In [64]:
# NLP
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import re
lemmatizer=WordNetLemmatizer()
import nltk
nltk.download('stopwords')
nltk.download('wordnet')


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [65]:
lemmatized_data=[]
for each in data['text']:
  review=re.sub('[^a-zA-Z]',' ',each)
  review=review.lower()
  review=review.split()
  review=[lemmatizer.lemmatize(word) for word in review if word not in stopwords.words("english")]
  review=" ".join(review)
  lemmatized_data.append(review)

data['text']=lemmatized_data
print(data['text'])


0              deed reason earthquake may allah forgive u
1                   forest fire near la ronge sask canada
2       resident asked shelter place notified officer ...
3       people receive wildfire evacuation order calif...
4       got sent photo ruby alaska smoke wildfire pour...
                              ...                        
7608    two giant crane holding bridge collapse nearby...
7609    aria ahrary thetawniest control wild fire cali...
7610             utc km volcano hawaii http co zdtoyd ebj
7611    police investigating e bike collided car littl...
7612    latest home razed northern california wildfire...
Name: text, Length: 7613, dtype: object


In [14]:
data.head()

Unnamed: 0,keyword,text,target
0,missing,deed reason earthquake may allah forgive u,1
1,missing,forest fire near la ronge sask canada,1
2,missing,resident asked shelter place notified officer ...,1
3,missing,people receive wildfire evacuation order calif...,1
4,missing,got sent photo ruby alaska smoke wildfire pour...,1


In [72]:
from tensorflow.keras.preprocessing.text import one_hot
from sklearn.feature_extraction.text import TfidfVectorizer
tf=TfidfVectorizer(max_features=10000)
tfidf_vectors=tf.fit_transform(data['text'])


***Tokenizing the text column of testing data***

In [73]:
lemmatized_test_data=[]
for each in test_data['text']:
  review=re.sub('[^a-zA-Z]',' ',each)
  review=review.lower()
  review=review.split()
  review=[lemmatizer.lemmatize(word) for word in review if word not in stopwords.words("english")]
  review=" ".join(review)
  lemmatized_test_data.append(review)

test_data['text']=lemmatized_test_data
print(test_data['text'])

0                             happened terrible car crash
1       heard earthquake different city stay safe ever...
2       forest fire spot pond goose fleeing across str...
3                    apocalypse lighting spokane wildfire
4                      typhoon soudelor kill china taiwan
                              ...                        
3258    earthquake safety los angeles safety fastener ...
3259    storm ri worse last hurricane city amp others ...
3260     green line derailment chicago http co utbxlcbiuy
3261    meg issue hazardous weather outlook hwo http c...
3262    cityofcalgary activated municipal emergency pl...
Name: text, Length: 3263, dtype: object


In [74]:
# vectorizing text column of testing data
test_tfidf_vectors=tf.transform(test_data['text'])  # on testing data just transform not train(fit)

In [75]:
for each in tfidf_vectors:
  print("sentence 1 index,values obtained by tfidf vectorizer")
  print(each)
  print("Sentence 1 in matrix form")
  print(each.toarray())
  break

sentence 1 index,values obtained by tfidf vectorizer
<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 6 stored elements and shape (1, 10000)>
  Coords	Values
  (0, 1370)	0.4917752267617825
  (0, 8109)	0.3600775896596117
  (0, 1658)	0.33751907334621206
  (0, 7011)	0.3037960127435994
  (0, 159)	0.4247908189689651
  (0, 2909)	0.4917752267617825
Sentence 1 in matrix form
[[0. 0. 0. ... 0. 0. 0.]]


In [79]:
feature_names = tf.get_feature_names_out()
# showing the words at given index of tfidf vector for sentence 1
print(feature_names[1054])
print(feature_names[3725])
print(feature_names[1342])
print(feature_names[2630])
print(feature_names[102])

coastal
grant
ddg
fidfn
af


***Combine with TF-IDF features***

In [80]:
# Now just horizontally stack the two sparse matrices of training dataset:
from scipy.sparse import hstack
X_train = hstack([tfidf_vectors, keyword_encoded])


In [81]:
# Now just horizontally stack the two sparse matrices of training dataset:
from scipy.sparse import hstack
X_test = hstack([test_tfidf_vectors,test_keyword_encoded])
y_train=data['target']

***Performing task with Machine Learning***

In [87]:
from sklearn.ensemble import RandomForestClassifier
model=RandomForestClassifier()
model.fit(X_train,y_train)
y_pred2=model.predict(X_test)
y_pred2.shape


(3263,)

In [85]:
y_test=pd.read_csv("sample_submission.csv")
y_test=y_test.drop(['id'],axis=1)
y_test.shape

(3263, 1)

In [88]:
from sklearn.metrics import accuracy_score
print(accuracy_score(y_test,y_pred2))

0.6708550413729697


***Performing task with deep learning***

In [None]:
# Using Deep learning
from tensorflow.keras.layers import Dense,Bidirectional,Dropout
from tensorflow.keras.models import Sequential
from tensorflow.keras.activations import linear,sigmoid


In [90]:
# Layers
model = Sequential()
model.add(Dense(128, activation='relu'))  # In hidden layers(4 here) the relu activation is used
model.add(Dropout(0.3))  # 30 of neurons will be droped
model.add(Dense(64, activation='relu'))
model.add(Dropout(0.25))
model.add(Dense(32, activation='relu'))
model.add(Dropout(0.2))
model.add(Dense(16, activation='relu'))
model.add(Dense(1, activation='sigmoid'))  # Output layer
model.build((None, 10222))
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
model.summary()



In [94]:
# Model training
model.fit(X_train,y_train,epochs=5,batch_size=1000,validation_data=(X_test,y_test))

# validation is to show how generalized the model is , how well model will perform on unseen data


Epoch 1/5
[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 2s/step - accuracy: 0.5647 - loss: 0.6753 - val_accuracy: 1.0000 - val_loss: 0.5485
Epoch 2/5
[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m18s[0m 2s/step - accuracy: 0.5689 - loss: 0.6534 - val_accuracy: 1.0000 - val_loss: 0.4905
Epoch 3/5
[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m15s[0m 2s/step - accuracy: 0.5983 - loss: 0.5983 - val_accuracy: 0.8020 - val_loss: 0.4417
Epoch 4/5
[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m23s[0m 2s/step - accuracy: 0.7867 - loss: 0.5117 - val_accuracy: 0.6604 - val_loss: 0.4692
Epoch 5/5
[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 2s/step - accuracy: 0.8788 - loss: 0.3989 - val_accuracy: 0.5820 - val_loss: 0.7146


<keras.src.callbacks.history.History at 0x7a55732d84d0>

In [95]:
# Getting predictions
y_pred=model.predict(X_test)

[1m102/102[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 105ms/step


In [96]:
# Checking testing accuracy
from sklearn.metrics import accuracy_score,confusion_matrix,classification_report
testing_accuracy=accuracy_score(y_test,y_pred.round())
print(testing_accuracy)

0.581979773214833


In [97]:
# Classification report
print(classification_report(y_test,y_pred.round()))


              precision    recall  f1-score   support

         0.0       1.00      0.58      0.74      3263
         1.0       0.00      0.00      0.00         0

    accuracy                           0.58      3263
   macro avg       0.50      0.29      0.37      3263
weighted avg       1.00      0.58      0.74      3263



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [None]:
# Confusion matrix
print(confusion_matrix(y_test,y_pred.round()))

In [99]:
# Converting obtained y_pred2 in the csv file

# Converting y_pred2 in the one-dimension
y_pred2 = y_pred2.flatten()


# If y_pred is a NumPy array or list
# Convert the NumPy array to a DataFrame
df = pd.DataFrame({'prediction': y_pred2})
df.to_csv('predictions.csv', index=False)


In [6]:
import numpy
import pandas
#import seaborn
import matplotlib.pyplot as plt
from sklearn.metrics import accuracy_score

ModuleNotFoundError: No module named 'matplotlib'