In [1]:
!pip install numpy
!pip install pandas
!pip install nltk
!pip install scikit-learn
!pip install gensim

Collecting nltk
  Downloading nltk-3.8.1-py3-none-any.whl (1.5 MB)
     ---------------------------------------- 0.0/1.5 MB ? eta -:--:--
     - -------------------------------------- 0.0/1.5 MB 1.9 MB/s eta 0:00:01
     ---- ----------------------------------- 0.2/1.5 MB 2.1 MB/s eta 0:00:01
     -------- ------------------------------- 0.3/1.5 MB 2.1 MB/s eta 0:00:01
     --------- ------------------------------ 0.4/1.5 MB 1.9 MB/s eta 0:00:01
     ------------- -------------------------- 0.5/1.5 MB 2.1 MB/s eta 0:00:01
     ----------------- ---------------------- 0.7/1.5 MB 2.2 MB/s eta 0:00:01
     --------------------- ------------------ 0.8/1.5 MB 2.5 MB/s eta 0:00:01
     -------------------------- ------------- 1.0/1.5 MB 2.5 MB/s eta 0:00:01
     -------------------------------- ------- 1.2/1.5 MB 2.7 MB/s eta 0:00:01
     --------------------------------- ------ 1.3/1.5 MB 2.6 MB/s eta 0:00:01
     ------------------------------------- -- 1.4/1.5 MB 2.5 MB/s eta 0:00:01
    

In [3]:
import io
import pandas as pd

data = pd.read_csv('messages.csv',encoding='windows-1252')

In [4]:
data.head()

Unnamed: 0,parent,student,created,subject,message,attachment,label
0,none,student 1,"12 January 2010, 10:49 PM",About ISDN,Can any one tell me about ISDN?,0,C-TE
1,student 1,student 2,"13 January 2010, 12:43 PM",About ISDN,(ISDN) = Integrted Services Digital Netork is ...,0,C-IN
2,student 2,student 1,"13 January 2010, 03:28 PM",About ISDN,Thank you friend.....,0,C-RA
3,student 1,student 3,"14 January 2010, 06:18 PM",About ISDN,"IDSN is basicly a digital dailup connection, y...",0,C-EX
4,student 1,student 4,"13 January 2010, 02:55 PM",About ISDN,Integrated Services Digital Network (ISDN) is ...,0,C-EX


In [5]:
data.isnull().sum()

parent        0
student       0
created       0
subject       0
message       0
attachment    0
label         0
dtype: int64

In [6]:
# Preprocess data
import nltk
from nltk.tokenize import word_tokenize
nltk.download('punkt')

def preprocess_text(text):
    # Tokenize and lowercase
    tokens = word_tokenize(text.lower())
    # Remove punctuation and other non-alphabetic characters
    tokens = [word for word in tokens if word.isalpha()]
    return tokens

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\User\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [7]:
# Extract Named Entities
from nltk import ne_chunk

def extract_named_entities(text):
    tokens = preprocess_text(text)
    tagged_tokens = nltk.pos_tag(tokens)
    named_entities = ne_chunk(tagged_tokens)
    return named_entities

In [8]:
# Count Named Entities
def count_named_entities(text):
    named_entities = extract_named_entities(text)
    count = sum(1 for chunk in named_entities if hasattr(chunk, 'label'))
    return count

In [9]:
# Calculate Cohesion

import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize, sent_tokenize

def compute_cohesion(text):
    # Tokenize the text into sentences
    sentences = sent_tokenize(text)

    # Remove stopwords
    stop_words = set(stopwords.words('english'))

    # Tokenize each sentence into words
    tokenized_sentences = [word_tokenize(sentence.lower()) for sentence in sentences]

    # Flatten the list of lists
    words = [word for sentence in tokenized_sentences for word in sentence if word.isalnum() and word not in stop_words]

    # Compute word overlap between consecutive sentences
    word_overlap_count = 0
    for i in range(len(tokenized_sentences) - 1):
        sentence1 = set(tokenized_sentences[i])
        sentence2 = set(tokenized_sentences[i + 1])
        word_overlap_count += len(sentence1.intersection(sentence2))

    # Compute cohesion as the total word overlap normalized by the total number of words
    total_words = len(words)
    cohesion = word_overlap_count / total_words if total_words > 0 else 0

    return cohesion

In [10]:
# Calculated Coherence

import nltk
from nltk import bigrams, FreqDist
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize, sent_tokenize

def calculate_coherence(text):
    # Tokenize the text into sentences and words
    sentences = sent_tokenize(text)
    words = [word.lower() for sentence in sentences for word in word_tokenize(sentence)]

    # Remove stop words
    stop_words = set(stopwords.words('english'))
    words = [word for word in words if word.isalnum() and word not in stop_words]

    # Calculate bigrams
    bi_grams = list(bigrams(words))

    # Calculate frequency distribution of words and bigrams
    freq_dist_words = FreqDist(words)
    freq_dist_bigrams = FreqDist(bi_grams)

    # Calculate Pointwise Mutual Information (PMI)
    coherence = sum([freq_dist_bigrams[bigram] * freq_dist_words[bigram[0]] * freq_dist_words[bigram[1]] for bigram in bi_grams])

    return coherence

In [11]:
# Calculate word count of a given text

def word_count(text):
    # Use split() to break the text into words
    words = text.split()

    # Count the number of words
    count = len(words)

    return count

In [12]:
# Add Feature to Your Dataset
import pandas as pd

# Assuming 'messages' is your DataFrame with columns 'message' and 'label'
data['named_entities_count'] = data['message'].apply(count_named_entities)
data['cohesion'] = data['message'].apply(compute_cohesion)
data['coherence'] = data['message'].apply(calculate_coherence)
data['wc'] = data['message'].apply(word_count)

In [14]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
from gensim.models import Word2Vec
from nltk.tokenize import word_tokenize

# Assuming your dataset is stored in a DataFrame named 'data'
# Preprocess the 'created' column
data['created'] = pd.to_datetime(data['created'])
data['hour'] = data['created'].dt.hour  # Extract hour as a feature

# Combine text features
data['combined_text'] = data[['parent', 'student', 'subject', 'message']].astype(str).agg(' '.join, axis=1)

# Tokenize the text
data['tokenized_text'] = data['combined_text'].apply(lambda x: word_tokenize(x.lower()))

# Train Word2Vec model
word2vec_model = Word2Vec(sentences=data['tokenized_text'], vector_size=100, window=5, min_count=1, workers=4)

# Function to average Word2Vec vectors for a text
def average_word2vec(tokens, model, vector_size):
    vector_sum = sum(model.wv[word] for word in tokens if word in model.wv)
    return vector_sum / len(tokens) if len(tokens) > 0 else np.zeros(vector_size)

# Create Word2Vec vectors for each text
data['word2vec'] = data['tokenized_text'].apply(lambda x: average_word2vec(x, word2vec_model, vector_size=100))

# Convert Word2Vec vectors to DataFrame columns
word2vec_columns = pd.DataFrame(data['word2vec'].to_list(), columns=[f'w2v_{i}' for i in range(100)])

# Combine Word2Vec features, hour, and other features
X_combined = pd.concat([word2vec_columns, data[['hour', 'named_entities_count', 'cohesion', 'coherence', 'wc']]], axis=1)

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_combined, data['label'], test_size=0.2, random_state=42)

# Initialize and train the Random Forest model
rf_model = RandomForestClassifier()
rf_model.fit(X_train, y_train)

# Make predictions on the test set
predictions = rf_model.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, predictions)
print(f"Accuracy: {accuracy}")

# Evaluate the model
print(classification_report(y_test, predictions))


Accuracy: 0.38524590163934425
              precision    recall  f1-score   support

        C-EX       0.31      0.36      0.33        39
        C-IN       0.48      0.57      0.52        51
        C-RA       0.00      0.00      0.00        15
        C-TE       0.27      0.24      0.25        17

    accuracy                           0.39       122
   macro avg       0.26      0.29      0.28       122
weighted avg       0.34      0.39      0.36       122



In [16]:
import pickle

# Save the model to a file using pickle
model_filename = "rf_model.pickle"

with open(model_filename, 'wb') as file:
    pickle.dump((rf_model,word2vec_model), file)