In [1]:
import io
import pandas as pd

data = pd.read_csv('messages.csv',encoding='windows-1252')

In [2]:
data.head()

Unnamed: 0,parent,student,created,subject,message,attachment,label
0,none,student 1,"12 January 2010, 10:49 PM",About ISDN,Can any one tell me about ISDN?,0,C-TE
1,student 1,student 2,"13 January 2010, 12:43 PM",About ISDN,(ISDN) = Integrted Services Digital Netork is ...,0,C-IN
2,student 2,student 1,"13 January 2010, 03:28 PM",About ISDN,Thank you friend.....,0,C-RA
3,student 1,student 3,"14 January 2010, 06:18 PM",About ISDN,"IDSN is basicly a digital dailup connection, y...",0,C-EX
4,student 1,student 4,"13 January 2010, 02:55 PM",About ISDN,Integrated Services Digital Network (ISDN) is ...,0,C-EX


In [3]:
data.isnull().sum()

parent        0
student       0
created       0
subject       0
message       0
attachment    0
label         0
dtype: int64

In [4]:
# Preprocess data
import nltk
from nltk.tokenize import word_tokenize
nltk.download('punkt')

def preprocess_text(text):
    # Tokenize and lowercase
    tokens = word_tokenize(text.lower())
    # Remove punctuation and other non-alphabetic characters
    tokens = [word for word in tokens if word.isalpha()]
    return tokens

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\User\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [5]:
# Extract Named Entities
from nltk import ne_chunk

def extract_named_entities(text):
    tokens = preprocess_text(text)
    tagged_tokens = nltk.pos_tag(tokens)
    named_entities = ne_chunk(tagged_tokens)
    return named_entities

In [6]:
# Count Named Entities
def count_named_entities(text):
    named_entities = extract_named_entities(text)
    count = sum(1 for chunk in named_entities if hasattr(chunk, 'label'))
    return count

In [7]:
# Calculate Cohesion

import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize, sent_tokenize

def compute_cohesion(text):
    # Tokenize the text into sentences
    sentences = sent_tokenize(text)

    # Remove stopwords
    stop_words = set(stopwords.words('english'))

    # Tokenize each sentence into words
    tokenized_sentences = [word_tokenize(sentence.lower()) for sentence in sentences]

    # Flatten the list of lists
    words = [word for sentence in tokenized_sentences for word in sentence if word.isalnum() and word not in stop_words]

    # Compute word overlap between consecutive sentences
    word_overlap_count = 0
    for i in range(len(tokenized_sentences) - 1):
        sentence1 = set(tokenized_sentences[i])
        sentence2 = set(tokenized_sentences[i + 1])
        word_overlap_count += len(sentence1.intersection(sentence2))

    # Compute cohesion as the total word overlap normalized by the total number of words
    total_words = len(words)
    cohesion = word_overlap_count / total_words if total_words > 0 else 0

    return cohesion

In [8]:
# Calculated Coherence

import nltk
from nltk import bigrams, FreqDist
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize, sent_tokenize

def calculate_coherence(text):
    # Tokenize the text into sentences and words
    sentences = sent_tokenize(text)
    words = [word.lower() for sentence in sentences for word in word_tokenize(sentence)]

    # Remove stop words
    stop_words = set(stopwords.words('english'))
    words = [word for word in words if word.isalnum() and word not in stop_words]

    # Calculate bigrams
    bi_grams = list(bigrams(words))

    # Calculate frequency distribution of words and bigrams
    freq_dist_words = FreqDist(words)
    freq_dist_bigrams = FreqDist(bi_grams)

    # Calculate Pointwise Mutual Information (PMI)
    coherence = sum([freq_dist_bigrams[bigram] * freq_dist_words[bigram[0]] * freq_dist_words[bigram[1]] for bigram in bi_grams])

    return coherence

In [9]:
# Calculate word count of a given text

def word_count(text):
    # Use split() to break the text into words
    words = text.split()

    # Count the number of words
    count = len(words)

    return count

In [10]:
# Add Feature to Your Dataset
import pandas as pd

# Assuming 'messages' is your DataFrame with columns 'message' and 'label'
data['named_entities_count'] = data['message'].apply(count_named_entities)
data['cohesion'] = data['message'].apply(compute_cohesion)
data['coherence'] = data['message'].apply(calculate_coherence)
data['wc'] = data['message'].apply(word_count)

In [11]:
!pip install gensim



You should consider upgrading via the 'c:\users\user\label predictor\model\venv\scripts\python.exe -m pip install --upgrade pip' command.


In [12]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from gensim.models import Word2Vec
from nltk.tokenize import word_tokenize


# Prepare the text data for Word2Vec
sentences = [word_tokenize(text) for text in data['message']]
vector_size = 100  # You can adjust the vector size based on your dataset and requirements

# Train Word2Vec model
word2vec_model = Word2Vec(sentences, vector_size=vector_size, window=5, min_count=1, sg=0)

# Function to convert text to average Word2Vec vectors
def text_to_vector(text):
    words = word_tokenize(text)
    vec_sum = np.zeros(vector_size)
    word_count = 0
    for word in words:
        if word in word2vec_model.wv:
            vec_sum += word2vec_model.wv[word]
            word_count += 1
    if word_count > 0:
        return vec_sum / word_count
    else:
        return vec_sum

# Convert text data to Word2Vec vectors
data['message_vector'] = data['message'].apply(text_to_vector)

# Select features (X) and target labels (Y)
X = data['message_vector']
Y = data['label']

# Split the data into training and testing sets
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=42)

# Train a Random Forest classifier
rf_classifier = RandomForestClassifier(n_estimators=100, random_state=42)
# cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)  # Using StratifiedKFold for classification
# accuracy_scores = cross_val_score(rf_classifier, X['message_vector'].tolist(), Y, cv=cv, scoring='accuracy')
rf_classifier.fit(X_train.tolist(), Y_train)

# Make predictions on the test set
Y_pred = rf_classifier.predict(X_test.tolist())

# Evaluate the model
from sklearn.metrics import accuracy_score
print("Accuracy:", accuracy_score(Y_test, Y_pred))

# Evaluate the model
print(classification_report(Y_test, Y_pred))


Accuracy: 0.32786885245901637
              precision    recall  f1-score   support

        C-EX       0.27      0.31      0.29        39
        C-IN       0.43      0.49      0.46        51
        C-RA       0.00      0.00      0.00        15
        C-TE       0.20      0.18      0.19        17

    accuracy                           0.33       122
   macro avg       0.22      0.24      0.23       122
weighted avg       0.29      0.33      0.31       122



In [13]:
import pickle

# Save the model to a file using pickle
model_filename = "rf_model.pickle"

with open(model_filename, 'wb') as file:
    pickle.dump((rf_classifier,word2vec_model), file)