<a href="https://colab.research.google.com/github/MarehWilliams01/nlp/blob/main/Natural_Language_Processing_With_Disaster_Tweets.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
pip install emoji

In [None]:
# Importing the necessary libraries
import pandas as pd
import numpy as np
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import spacy
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.linear_model import RidgeClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn import svm
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import emoji
from bs4 import BeautifulSoup
import requests
import re

# Fetch the NLTK resources used by this notebook. 'punkt_tab' is requested in
# addition to 'punkt' because newer NLTK releases load it for word_tokenize.
for resource in ('stopwords', 'punkt', 'punkt_tab', 'wordnet'):
    nltk.download(resource)

In [None]:
# Importing the test and train dataset
df_train = pd.read_csv('/content/drive/MyDrive/Datasets/train.csv')
df_test = pd.read_csv('/content/drive/MyDrive/Datasets/test.csv')

# Columns dropped from both frames before any text processing
remove = ['id', 'keyword', 'location']

df_train = df_train.drop(columns=remove)
df_test = df_test.drop(columns=remove)

# Only the last expression of a cell is rendered automatically, so both frames
# are shown explicitly; a short preview is enough to check the load.
display(df_train.head(), df_test.head())

In [None]:
# Developing a dictionary for shorthand texts
# Sending a GET request. The timeout keeps the cell from hanging on a stalled
# connection, and raise_for_status() stops on an HTTP error instead of parsing
# an error page into an empty slang table.
url = 'https://messente.com/blog/text-abbreviations'
response = requests.get(url, timeout=30)
response.raise_for_status()

# Parsing the HTML content with BeautifulSoup
soup = BeautifulSoup(response.content, "html.parser")

# Extracting the data from the second and third 'b' tags in the html page
b_tags = soup.find_all('b')
if len(b_tags) < 3:
    raise ValueError(
        f"Expected at least 3 <b> tags at {url}, found {len(b_tags)}; the page layout may have changed"
    )
second_b_tag = b_tags[1]
third_b_tag = b_tags[2]

# Finding all but the first paragraph in the second and the third b tags
# (calling a tag is BeautifulSoup shorthand for find_all)
second_paragraphs = second_b_tag('p')[1:]
third_paragraphs = third_b_tag('p')[1:]

# Extracting the slangs with their respective descriptions and storing it in a dataframe
MAX_SLANGS = 100  # only the first 100 paragraphs are turned into entries
slang_list = []
description_list = []

for paragraph in (second_paragraphs + third_paragraphs)[:MAX_SLANGS]:
    # Text before the first " – " is the slang, text after it is the description
    split_text = paragraph.text.strip().split(" – ", 1)
    # Drop whatever precedes the first ". " and lower-case the slang
    slang = split_text[0].split(". ", 1)[-1].lower()
    # A paragraph without the separator gets an empty description
    description = split_text[1] if len(split_text) > 1 else ""
    slang_list.append(slang)
    description_list.append(description)

df_slangs = pd.DataFrame({
    "slang": slang_list,
    "description": description_list
})

df_slangs

In [160]:
# Preprocessing the data

def _slang_lookup(slang_frame):
  """Map each slang term to its description, keeping the first row for duplicated terms."""
  lookup = {}
  for slang, description in zip(slang_frame['slang'], slang_frame['description']):
    lookup.setdefault(slang, description)
  return lookup

# Developing a function to clean the data
def cleaning_tweet(text, slang_lookup=None):
  """Clean one tweet and return it lower-cased.

  Removes mentions, '#' symbols, retweet markers, hyperlinks, exclamation and
  question marks, collapses repeated fullstops and whitespace, replaces emojis
  with their text labels and expands known slang terms.

  Args:
    text: the raw tweet as a string.
    slang_lookup: optional dict mapping lower-case slang terms to their
      descriptions. When omitted it is built from the module-level `df_slangs`.

  Returns:
    The cleaned, lower-cased tweet.
  """
  if slang_lookup is None:
    slang_lookup = _slang_lookup(df_slangs)

  text = re.sub(r'@[A-Za-z0-9_]+', '', text) # removes mentions (handles may contain underscores)
  text = re.sub(r'#', '', text) # removes the '#' symbol, the hashtag word itself is kept
  text = re.sub(r'\bRT\s+', '', text) # removes retweet markers; \b keeps words ending in "RT" (e.g. "ALERT") intact
  text = re.sub(r'https?:\/\/\S+', '', text) # removes hyperlinks
  text = re.sub(r'\.{2,}', '.', text) # collapses repeated fullstops into a single one
  text = re.sub(r'!', '', text) # removes exclamation marks
  text = re.sub(r'\?', '', text) # removes question marks
  text = re.sub(r'\s+', ' ', text) # collapses runs of whitespace into one space
  # replacing the emojis with respective labels; the space delimiters keep a label
  # from being glued onto the neighbouring word or onto the next emoji label
  text = emoji.demojize(text, delimiters=(" ", " "))
  # expands slangs: the lookup keys are lower-case, so the word is lower-cased for
  # the lookup; a missing or empty description leaves the word unchanged
  words = text.split() # splits text into each word
  normalized_words = [slang_lookup.get(word.lower()) or word for word in words]
  text = " ".join(normalized_words)

  return text.lower()

# Run the cleaning function over the 'text' column of the train and the test frame
for frame in (df_train, df_test):
  frame['text'] = frame['text'].apply(cleaning_tweet)

In [None]:
# Lemmatization
# Only token lemmas are read from the parsed documents, so the dependency parser
# and the named-entity recogniser are switched off to avoid running them on
# every tweet.
nlp = spacy.load("en_core_web_sm", disable=["parser", "ner"])

# Function to lemmatize each word in a tweet
def lemmatize_text(text):
    """Return `text` with every spaCy token replaced by its lemma, joined by single spaces."""
    return ' '.join(token.lemma_ for token in nlp(text))

# Lemmatize the 'text' column of the train and the test frame
for frame in (df_train, df_test):
    frame['text'] = frame['text'].apply(lemmatize_text)

In [None]:
# Removing stopwords

# Getting the current list of English stopwords
stopword_list = stopwords.words('english')
# Set copy of the list so every token is checked with a hash lookup instead of a list scan
_stopword_set = frozenset(stopword_list)

# Function to remove stopwords from tweet
def remove_stopwords(text):
    """Tokenize `text` with word_tokenize and return the non-stopword tokens joined by spaces.

    A token is dropped when its lower-cased form is one of the English stopwords.
    """
    tokens = word_tokenize(text)
    return ' '.join(token for token in tokens if token.lower() not in _stopword_set)

# Apply stopword removal to the 'text' column of BOTH frames so the test tweets
# go through the same preprocessing as the training tweets
df_train['text'] = df_train['text'].apply(remove_stopwords)
df_test['text'] = df_test['text'].apply(remove_stopwords)

df_train

In [176]:
# Building Vectors
# The vocabulary is learned from the training tweets only and then reused to
# encode the test tweets
count_vectorizer = CountVectorizer()
train_vectors = count_vectorizer.fit_transform(df_train['text'])
test_vectors = count_vectorizer.transform(df_test['text'])

# A bare expression in the middle of a cell is not rendered, so the two
# matrices are displayed explicitly
display(train_vectors, test_vectors)

# Re-weight the raw counts with TF-IDF; the weights are fitted on the training counts
tfidf_transformer = TfidfTransformer()
tfidf_train = tfidf_transformer.fit_transform(train_vectors)
tfidf_test = tfidf_transformer.transform(test_vectors)

In [None]:
# Building Our Model
# Hold out 20% of the labelled rows for validation (fixed seed for a repeatable split).
# NOTE(review): tfidf_train was fitted on every training row before this split,
# so the held-out scores may be somewhat optimistic.
labels = df_train['target']
X_train, X_test, y_train, y_test = train_test_split(
    tfidf_train, labels, test_size=0.2, random_state=42
)

# Model1: MultinomialNB
model = MultinomialNB()
model.fit(X_train, y_train)

# Metrics, always computed in the order accuracy, precision, recall, F1
metric_functions = (accuracy_score, precision_score, recall_score, f1_score)

train_predictions = model.predict(X_train)
train_accuracy, train_precision, train_recall, train_f1 = [
    metric(y_train, train_predictions) for metric in metric_functions
]

test_predictions = model.predict(X_test)
test_accuracy, test_precision, test_recall, test_f1 = [
    metric(y_test, test_predictions) for metric in metric_functions
]

# First line: scores on the fitted rows; second line: scores on the held-out rows
print(train_accuracy, train_precision, train_recall, train_f1)
print(test_accuracy, test_precision, test_recall, test_f1)


In [None]:
# Model2: Ridge Classifier
model = RidgeClassifier()
model.fit(X_train, y_train)

# Metrics, always computed in the order accuracy, precision, recall, F1
metric_functions = (accuracy_score, precision_score, recall_score, f1_score)

train_predictions = model.predict(X_train)
train_accuracy, train_precision, train_recall, train_f1 = [
    metric(y_train, train_predictions) for metric in metric_functions
]

test_predictions = model.predict(X_test)
test_accuracy, test_precision, test_recall, test_f1 = [
    metric(y_test, test_predictions) for metric in metric_functions
]

# First line: scores on the fitted rows; second line: scores on the held-out rows
print(train_accuracy, train_precision, train_recall, train_f1)
print(test_accuracy, test_precision, test_recall, test_f1)

In [None]:
# Model3: Linear SVC
# random_state is fixed so repeated runs of the notebook give the same fitted model
model = svm.LinearSVC(random_state=42)
model.fit(X_train, y_train)

# Metrics on the rows the model was fitted on

train_predictions = model.predict(X_train)
train_accuracy = accuracy_score(y_train, train_predictions)
train_precision = precision_score(y_train, train_predictions)
train_recall = recall_score(y_train, train_predictions)
train_f1 = f1_score(y_train, train_predictions)

# Metrics on the held-out rows

test_predictions = model.predict(X_test)
test_accuracy = accuracy_score(y_test, test_predictions)
test_precision = precision_score(y_test, test_predictions)
test_recall = recall_score(y_test, test_predictions)
test_f1 = f1_score(y_test, test_predictions)

# Each line prints accuracy, precision, recall, F1 (train first, then held-out)
print(train_accuracy, train_precision, train_recall, train_f1)
print(test_accuracy, test_precision, test_recall, test_f1)

In [184]:
# Predicting sample submission values and storing them

# A fresh classifier is created here instead of re-fitting whatever object the
# name `model` holds from the last executed cell. LinearSVC is the model that
# name refers to when the notebook is run top to bottom.
y_train_t = df_train['target']
final_model = svm.LinearSVC(random_state=42)
final_model.fit(tfidf_train, y_train_t)
pred = final_model.predict(tfidf_test)

sample_submission = pd.read_csv('/content/drive/MyDrive/Datasets/sample_submission.csv')
# Predictions are assigned by position.
# NOTE(review): this assumes sample_submission.csv lists the same rows in the
# same order as test.csv; confirm against the data files.
sample_submission['target'] = pred

sample_submission.to_csv("submission.csv", index=False)

# Preview of what was written (last expression, so it is rendered)
sample_submission.head()