In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.feature_extraction.text import CountVectorizer
from wordcloud import WordCloud, STOPWORDS
from gensim.models import word2vec
import spacy
import nltk

# Load the spaCy pipeline named "en_core_web_sm" once, up front.
nlp = spacy.load("en_core_web_sm")

from warnings import filterwarnings

# NOTE(review): this installs a blanket "ignore" filter, so every warning
# raised after this point (including library deprecation notices) is hidden.
filterwarnings("ignore")

In [None]:
# Read the dataset; the path is relative to the notebook's working directory.
df = pd.read_csv("Data/cleaned_data.csv")

In [None]:
# Preview the first rows of the loaded frame.
df.head()

In [None]:
# Column dtypes and non-null counts.
df.info()

In [None]:
# Character-length summary of the "Body" column.
# The per-row length Series is computed once and reused for all four
# statistics instead of being rebuilt for every print.
body_lengths = df["Body"].str.len()

print("Max length of the body: ", body_lengths.max())
print("Min length of the body: ", body_lengths.min())
print("Mean length of the body: ", body_lengths.mean())
print("Median length of the body: ", body_lengths.median())

In [None]:
# Distribution of the "Tags Count" column.
# `color="blue"` was dropped: seaborn takes the bar colors from `palette`
# whenever one is given, so the two arguments conflicted and the blue never
# took effect.
# NOTE(review): seaborn >= 0.13 deprecates `palette` without `hue`; migrate to
# hue="Tags Count", legend=False once older seaborn versions need no support.
sns.countplot(data=df, x="Tags Count", palette="viridis")
plt.title("Distribution of tag count")
plt.ylabel("Frequency")
plt.xlabel("Tag count")
plt.show()

In [None]:
# Build a tag-count matrix: one row per entry of df["Tags"], one column per
# distinct token produced by the tokenizer.
def _split_tags(raw_tags):
    """Split a tag string on whitespace (the value is stringified first)."""
    return str(raw_tags).split()


tag_vectorizer = CountVectorizer(tokenizer=_split_tags)
tag_mat = tag_vectorizer.fit_transform(df["Tags"])

In [None]:
# Feature (tag) names learned by the vectorizer fitted above.
tag_names = tag_vectorizer.get_feature_names_out()
# Display the container type and the number of distinct tags.
type(tag_names), len(tag_names)

In [None]:
# Peek at the first 20 tag names.
tag_names[:20]

In [None]:
# Sum over rows (axis=0): one total count per tag column.
tag_freq = tag_mat.sum(axis=0)

In [None]:
# Tag name -> total count, ordered from most to least frequent.
# `.A1` flattens the summed counts to a 1-D array so it lines up with `tag_names`.
tag_freq_ser = pd.Series(tag_freq.A1, index=tag_names).sort_values(ascending=False)
tag_freq_ser.head(10)

In [None]:
# Bar chart of the 50 most frequent tags (tag_freq_ser is already sorted
# in descending order of frequency).
top_50_tags = tag_freq_ser.iloc[:50]

fig = plt.figure(figsize=[20, 10])
sns.barplot(
    x=top_50_tags.index,
    y=top_50_tags.values,
    color=sns.xkcd_rgb["greenish cyan"],
)
plt.xticks(rotation=90)
plt.xlabel("Tags")
plt.ylabel("Frequency")
plt.title("Frequency of top 50 Tags")
plt.show()

In [None]:
def plot_top_tag_frequency(top_n):
    """Plot the frequencies of the `top_n` most frequent tags as a line chart.

    Reads the module-level `tag_freq_ser`, which is sorted in descending
    order of frequency, so the x position of a point is that tag's rank.

    Args:
        top_n: Number of leading entries of `tag_freq_ser` to plot.

    Returns:
        The matplotlib Figure that was drawn.
    """
    fig = plt.figure(figsize=[10, 7])
    plt.plot(tag_freq_ser.iloc[:top_n].values, c="blue")
    plt.title(f"Tag frequency distribution of top {top_n} Tags")
    plt.ylabel("Frequency")
    plt.xlabel("Tag ID")
    plt.show()
    return fig


# plot distribution of tag frequency for the top 500, 100 and 50 tags
# (one helper call per cutoff instead of three copy-pasted cells)
for top_n in (500, 100, 50):
    fig = plot_top_tag_frequency(top_n)

In [None]:
# Word cloud of tags, with each tag weighted by its frequency in tag_freq_ser.
tag_cloud = WordCloud(background_color="black", max_words=200, scale=10)
wordcloud = tag_cloud.generate_from_frequencies(tag_freq_ser)

fig = plt.figure(figsize=[10, 10])
plt.imshow(wordcloud)
plt.axis("off")  # the image has no meaningful axes
plt.title("WordCloud of Tags")
plt.show()

In [None]:
# Combine the "Head" and "Body" columns into one text column.
df["Text"] = df["Head"] + " " + df["Body"]

# Merge the English stopword collections from NLTK, spaCy and wordcloud.
stopwords_nltk = nltk.corpus.stopwords.words("english")
stopwords_spacy = spacy.lang.en.stop_words.STOP_WORDS

# Single letters are treated as stopwords too. "c" and "r" are absent from the
# string below -- presumably so the C and R language names survive filtering
# (TODO confirm).
single_letters = set("abdefghijklmnopqstuvwxyz")

stopwords = list(
    set(stopwords_nltk) | set(stopwords_spacy) | set(STOPWORDS) | single_letters
)
print(len(stopwords))

In [None]:
# Frozen snapshot of the `stopwords` list: membership tests are O(1) instead
# of a linear scan of the list for every token. Re-run this cell if
# `stopwords` is changed afterwards.
_STOPWORD_SET = frozenset(stopwords)


def remove_stopwords(text):
    """Drop stopwords from `text` without changing its casing.

    The input is stringified and split on whitespace; tokens found in the
    stopword collection are removed and the rest are re-joined with single
    spaces. Matching is case-sensitive because the text is not lowercased
    here.

    Args:
        text: Value to filter; it is passed through `str()` first.

    Returns:
        str: The remaining tokens joined by single spaces.
    """
    # str.split() never yields tokens with surrounding whitespace, so no
    # per-token strip() is needed.
    return " ".join(word for word in str(text).split() if word not in _STOPWORD_SET)


df["Text_Uncleaned"] = df["Text"].apply(remove_stopwords)

In [None]:
# Train a Word2Vec model on the stopword-filtered (not lowercased) text.

# One whitespace-split token list per row of the column.
list_of_sent = [document.split() for document in df["Text_Uncleaned"].tolist()]

# NOTE(review): training uses 4 worker threads; gensim documents that
# multi-threaded training is not exactly reproducible between runs.
w2v_model = word2vec.Word2Vec(
    sentences=list_of_sent,
    vector_size=100,
    window=5,
    workers=4,
)

# Nearest neighbours of "detection" in the learned embedding space.
w2v_model.wv.most_similar("detection")

### Cleaning the Body Feature for Modeling
The steps below are applied to the combined `Text` column (Head + Body) after lowercasing:
- Lemmatization
- Remove stopwords
- Remove extra spaces

In [None]:
lemmatizer = nltk.stem.WordNetLemmatizer()

# Frozen snapshot of the `stopwords` list: membership tests are O(1) instead
# of a linear scan of the list for every token. Re-run this cell if
# `stopwords` is changed afterwards.
_STOPWORD_SET = frozenset(stopwords)


def clean_body(text):
    """Lowercase `text`, drop stopwords, and lemmatize the remaining words.

    The input is stringified, lowercased and split on whitespace. Tokens in
    the stopword collection are discarded; every other token is passed
    through the WordNet lemmatizer. Because the result is re-joined with
    single spaces, runs of whitespace in the input are collapsed.

    Args:
        text: Value to clean; it is passed through `str()` first.

    Returns:
        str: The lemmatized, stopword-free tokens joined by single spaces.
    """
    tokens = str(text).lower().split()
    # Filter and lemmatize in one pass; stopword filtering happens on the
    # raw lowercased token, before lemmatization.
    return " ".join(
        lemmatizer.lemmatize(token) for token in tokens if token not in _STOPWORD_SET
    )


df["Text_Cleaned"] = df["Text"].apply(clean_body)


In [None]:
# Train a Word2Vec model on the cleaned text (lowercased, stopwords removed,
# lemmatized). This rebinds `list_of_sent` and `w2v_model`.

# One whitespace-split token list per row of the column.
list_of_sent = [document.split() for document in df["Text_Cleaned"].tolist()]

# NOTE(review): training uses 4 worker threads; gensim documents that
# multi-threaded training is not exactly reproducible between runs.
w2v_model = word2vec.Word2Vec(
    sentences=list_of_sent,
    vector_size=100,
    window=5,
    workers=4,
)

# Nearest neighbours of "detection" in the learned embedding space.
w2v_model.wv.most_similar("detection")