In [None]:
import string

import matplotlib.pyplot as plt
import nltk
import numpy as np
import pandas as pd
from nltk.stem.snowball import SnowballStemmer

# LOAD DATA

In [None]:
# Create dataset: join the info from the 2 channels, change the channel names to 0 and 1.

def create_dataset(dataframe1, dataframe2):
    """Load two channel exports, stack them and binarise the channel label.

    Parameters
    ----------
    dataframe1, dataframe2 : str, path object or file-like
        CSV sources handed straight to ``pd.read_csv``.

    Returns
    -------
    pandas.DataFrame
        The rows of both files (only the columns they have in common), with a
        fresh 0..n-1 index. ``channel`` is 0 where the original value was
        "#pgl" and 1 otherwise. The ``datetime`` and ``Unnamed: 0`` columns
        are removed when present.
    """
    df1 = pd.read_csv(dataframe1)
    df2 = pd.read_csv(dataframe2)

    # ignore_index=True gives every row a unique label. Without it both files
    # contribute labels 0..k, and any later label-based operation (e.g.
    # DataFrame.drop) would hit rows from both files at once.
    final_df = pd.concat([df1, df2], join="inner", ignore_index=True)

    # Binary target: "#pgl" -> 0, every other channel -> 1.
    final_df["channel"] = np.where(final_df["channel"] == "#pgl", 0, 1)

    # These columns don't provide much information for the classification
    # problem. errors="ignore" keeps the loader usable for CSVs that were
    # saved without an index column or without timestamps.
    return final_df.drop(columns=["datetime", "Unnamed: 0"], errors="ignore")

In [None]:
# Build the combined, labelled dataset from the two channel exports.
final_df = create_dataset(
    r"/kaggle/input/dataframes/dataset1.csv",
    r"/kaggle/input/dataframes/df1.csv",
)
print(final_df.shape)

In [None]:
# Inspect the combined dataset returned by create_dataset.
final_df

# LOWERCASE

In [None]:
# Case must not distinguish tokens: "Sleep", "SLEEP", "SlEEp" and "sleeP"
# should all count as the same word, so normalise every message to lowercase.
final_df = final_df.assign(message=final_df["message"].str.lower())

# REMOVE ROWS CONTAINING ANY NaN

In [None]:
# Discard every row in which at least one column is missing, then report the
# remaining size.
final_df = final_df.dropna(axis=0, how="any")
print(final_df.shape)

# REMOVE STOPWORDS AND PUNCTUATION SYMBOLS

In [None]:
# Stop words come from scikit-learn's built-in English list
# (words like: can, could, will, been, would...). Every ASCII punctuation
# character is treated as a stop token as well.
from sklearn.feature_extraction import text
stop = text.ENGLISH_STOP_WORDS
stop_words = list(stop) + list(string.punctuation)
# Hashed copy used for the per-token lookups below: membership tests against
# the list above would scan several hundred entries for every single token.
_stop_word_set = frozenset(stop_words)


def clean_text(text):
    """Return ``text`` with stop words and punctuation tokens removed.

    The message is tokenised with ``nltk.word_tokenize``; tokens found in
    ``stop_words`` are discarded and the rest are re-joined with single
    spaces. A message made up only of stop tokens yields an empty string.

    NOTE: ``word_tokenize`` relies on NLTK's tokenizer data being available
    in the environment.
    """
    tokens = nltk.word_tokenize(text)
    return " ".join(token for token in tokens if token not in _stop_word_set)

In [None]:
# Strip stop words and punctuation from every message.
final_df["message"] = final_df["message"].map(clean_text)

In [None]:
# Inspect the messages after stop-word and punctuation removal.
final_df

# STEMMING

In [None]:
# Stemming reduces inflected forms of a word (e.g. connection, connects,
# connected) to a common root (e.g. connect). An English Snowball stemmer is used.
stemmer = SnowballStemmer("english")

# Split each message on whitespace, then stem every resulting word; the
# column ends up holding one list of stems per message.
tokenised_messages = final_df['message'].astype(str).str.split()
final_df['message'] = tokenised_messages.apply(
    lambda words: [stemmer.stem(word) for word in words]
)

print(final_df.shape)


# DROP THE ROWS THAT DON'T HAVE ANY MESSAGE! 

In [None]:
# Positions (0-based, in current row order) of the messages whose token list
# is empty; these are the rows we want to get rid of.
indexes_to_remove = [
    position
    for position, tokens in enumerate(final_df["message"])
    if len(tokens) == 0
]
print(indexes_to_remove)

In [None]:
# Keep only the rows whose token list is non-empty. The filter is computed
# from the data itself instead of translating the positions collected above
# into index labels: dropping by label removes *every* row that shares a
# label, so a non-unique index would silently lose valid messages. A mask is
# also safe to re-run, whereas stale positions would delete further rows.
has_tokens = final_df["message"].map(len) > 0
# .copy() so that later column assignments act on an independent frame.
final_df = final_df[has_tokens].copy()

In [None]:
# Dataset size after the empty-message rows were dropped.
final_df.shape

# SHOW A WORD CLOUD FOR EACH CHANNEL (MOST COMMON WORDS)

In [None]:
from wordcloud import WordCloud


def _flatten_tokens(channel_df):
    """Collect every token of every message of ``channel_df`` into one flat list."""
    return [token for tokens in channel_df["message"] for token in tokens]


# One sub-frame per class label.
df1_clean = final_df[final_df["channel"] == 0]
df2_clean = final_df[final_df["channel"] == 1]

# WordCloud.generate expects a single string, so flatten the per-message token
# lists and join them with spaces.
df1_messages_to_string = _flatten_tokens(df1_clean)
df2_messages_to_string = _flatten_tokens(df2_clean)

word_cloud_df1 = WordCloud(collocations = False, background_color = 'white').generate(
    " ".join(df1_messages_to_string)
)
word_cloud_df2 = WordCloud(collocations = False, background_color = 'white').generate(
    " ".join(df2_messages_to_string)
)

In [None]:
# Render the word cloud built from the channel-0 messages.
fig, ax = plt.subplots()
ax.imshow(word_cloud_df1, interpolation='bilinear')
ax.axis("off")
plt.show()

In [None]:
# Render the word cloud built from the channel-1 messages.
fig, ax = plt.subplots()
ax.imshow(word_cloud_df2, interpolation='bilinear')
ax.axis("off")
plt.show()

In [None]:
# Transform to string

def list_to_string(text):
    """Join a sequence of string tokens into one space-separated string."""
    return " ".join(text)

# Turn every list of stems back into a single space-separated string.
final_df["message"] = list(map(list_to_string, final_df["message"]))

In [None]:
# Inspect the dataset now that each message is a single string again.
final_df

# DIVIDE DATA: TRAIN & TEST SPLIT

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for evaluation; the fixed seed makes the split
# reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    final_df["message"],
    final_df["channel"],
    test_size=0.2,
    random_state=42,
)

# NAIVE BAYES CLASSIFIER

In [None]:
# ### SKLEARN MULTINOMIAL NAIVE BAYES

from sklearn . naive_bayes import MultinomialNB
from sklearn import metrics
import sklearn . feature_extraction . text as txt
from sklearn . pipeline import Pipeline
from sklearn . feature_extraction . text import TfidfVectorizer

In [None]:
# TF-IDF vectorizer with default settings, used to turn the message strings
# into numeric features.
# Equivalent to CountVectorizer followed by TfidfTransformer.
tf = TfidfVectorizer()

In [None]:
# Inspect the training messages that will be vectorized.
X_train

In [None]:
# Fit the vectorizer on the training messages only and build their
# document-term matrix, then project the test messages with that same
# fitted vectorizer.
vectors = tf.fit_transform(X_train)
vectors_test = tf.transform(X_test)
train_shape, test_shape = vectors.shape, vectors_test.shape
print(train_shape, test_shape)

In [None]:
# Train a multinomial Naive Bayes classifier (smoothing alpha = 0.01) on the
# TF-IDF features; fit() returns the estimator itself.
clfNB = MultinomialNB(alpha=0.01).fit(vectors, y_train)
clfNB

In [None]:
# Predict the channel label of every message in the test set.
pred = clfNB.predict(vectors_test)

In [None]:
# Inspect the predicted labels for the test set.
pred

In [None]:
# Mean accuracy of the classifier on the held-out test set.
clfNB.score(vectors_test , y_test)

In [None]:
# Confusion matrix of the true test labels against the predicted ones.
mat = metrics.confusion_matrix (y_test , pred )

In [None]:
# Raw confusion-matrix counts (rows: true labels, columns: predicted labels).
mat

In [None]:
# Draw the confusion matrix as an annotated heat map. scikit-learn's own
# display helper is used through the already-imported `metrics` module, so
# the cell does not rely on a plotting library that was never imported.
fig, ax = plt.subplots(figsize=(10, 10))
metrics.ConfusionMatrixDisplay(
    confusion_matrix=mat,
    display_labels=clfNB.classes_,
).plot(ax=ax, cmap="Blues", values_format="d")  # "d": show whole counts
ax.set_title("Naive Bayes confusion matrix (test set)")
plt.show()

In [None]:
# Mean accuracy on the training data, for comparison with the test score
# above (a much higher value here would point to overfitting).
clfNB.score(vectors, y_train)

In [None]:
from sklearn.metrics import classification_report

# Per-class precision, recall and F1 on the test set.
report = classification_report(y_test, pred)
print(report)