In [1]:
import pandas as pd

In [2]:
# Read the CSV file
df = pd.read_csv('dataset.csv')

df.head()

Unnamed: 0,id,text,lemma,upos,xpos,head,deprel,start_char,end_char
0,1,اس,یہ,DET,DEM,2,det,0,2
1,2,سلسلے,سلسلہ,NOUN,NN,5,nmod,3,8
2,3,کی,کا,ADP,PSP,2,case,9,11
3,4,دیگر,دیگر,ADJ,JJ,5,amod,12,16
4,5,اقساط,اقساط,NOUN,NN,7,nsubj,17,22


In [3]:
data = df.drop(columns=['id'])

In [4]:
# Initialize a new column 'y' with the default value 'S_M'
data['y'] = 'S_M'

# Iterate through the rows to assign 'S_B'
for i in range(len(data) - 1):
    # Check if the current word ends with a full stop
    if data.loc[i, 'text'].endswith('۔'):
        # Assign 'S_B' to the next word
        if i + 1 < len(data):
            data.loc[i + 1, 'y'] = 'S_B'  # Sentence Beginning

# Convert 'y' column to categorical type (optional, for ML efficiency)
data['y'] = data['y'].astype('category')

# Map categorical labels to numeric values
label_mapping = {'S_B': 1, 'S_M': 0}
data['y'] = data['y'].map(label_mapping)

# Verify the result
data.head(10)

Unnamed: 0,text,lemma,upos,xpos,head,deprel,start_char,end_char,y
0,اس,یہ,DET,DEM,2,det,0,2,0
1,سلسلے,سلسلہ,NOUN,NN,5,nmod,3,8,0
2,کی,کا,ADP,PSP,2,case,9,11,0
3,دیگر,دیگر,ADJ,JJ,5,amod,12,16,0
4,اقساط,اقساط,NOUN,NN,7,nsubj,17,22,0
5,یہاں,یہاں,PRON,PRP,7,obl,23,27,0
6,پڑھیے,پڑھ,VERB,VM,0,root,28,33,0
7,۔,۔,PUNCT,SYM,7,punct,33,34,0
8,یہ,یہ,PRON,PRP,3,nsubj,36,38,1
9,کیسے,کیسا,PRON,WQ,3,advmod,39,43,0


In [5]:
# Drop rows where the 'text' column contains only punctuation
data = data[~data['text'].str.contains(r'^[^\w\s]+$', na=False)]

# Verify the result
data.head(10)

Unnamed: 0,text,lemma,upos,xpos,head,deprel,start_char,end_char,y
0,اس,یہ,DET,DEM,2,det,0,2,0
1,سلسلے,سلسلہ,NOUN,NN,5,nmod,3,8,0
2,کی,کا,ADP,PSP,2,case,9,11,0
3,دیگر,دیگر,ADJ,JJ,5,amod,12,16,0
4,اقساط,اقساط,NOUN,NN,7,nsubj,17,22,0
5,یہاں,یہاں,PRON,PRP,7,obl,23,27,0
6,پڑھیے,پڑھ,VERB,VM,0,root,28,33,0
8,یہ,یہ,PRON,PRP,3,nsubj,36,38,1
9,کیسے,کیسا,PRON,WQ,3,advmod,39,43,0
10,ممکن,ممکن,ADJ,JJ,0,root,44,48,0


In [6]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.model_selection import train_test_split

In [7]:
# One-hot encode 'upos', 'xpos', and 'deprel'
encoder = OneHotEncoder(sparse_output=False)
encoded_cats = encoder.fit_transform(data[['upos', 'xpos', 'deprel']])

# Convert to DataFrame for easier merging
encoded_cats_df = pd.DataFrame(encoded_cats, columns=encoder.get_feature_names_out())

# Concatenate encoded features back to the dataset
data = pd.concat([data.reset_index(drop=True), encoded_cats_df], axis=1)

# Drop the original categorical columns (optional)
data = data.drop(columns=['upos', 'xpos', 'deprel'])

In [8]:
# Select the numerical features to normalize
numerical_features = ['start_char', 'end_char', 'head']

from sklearn.preprocessing import MinMaxScaler

# Option 2: Min-Max Scaling (scales features to a range, typically 0 to 1)
min_max_scaler = MinMaxScaler()
data[numerical_features] = min_max_scaler.fit_transform(data[numerical_features])

In [9]:
# Verify the result
data.head(10)

Unnamed: 0,text,lemma,head,start_char,end_char,y,upos_ADJ,upos_ADP,upos_ADV,upos_AUX,...,deprel_mark,deprel_nmod,deprel_nsubj,deprel_nummod,deprel_obj,deprel_obl,deprel_punct,deprel_root,deprel_vocative,deprel_xcomp
0,اس,یہ,0.014493,0.0,0.0,0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,سلسلے,سلسلہ,0.036232,3e-06,5e-06,0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,کی,کا,0.014493,8e-06,8e-06,0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,دیگر,دیگر,0.036232,1.1e-05,1.2e-05,0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,اقساط,اقساط,0.050725,1.5e-05,1.8e-05,0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,یہاں,یہاں,0.050725,2e-05,2.2e-05,0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
6,پڑھیے,پڑھ,0.0,2.5e-05,2.7e-05,0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
7,یہ,یہ,0.021739,3.2e-05,3.2e-05,1,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,کیسے,کیسا,0.021739,3.5e-05,3.6e-05,0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,ممکن,ممکن,0.0,3.9e-05,4.1e-05,0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0


In [10]:
from gensim.models import FastText, KeyedVectors
import numpy as np

In [11]:
from gensim.models import FastText

# Assuming the model is in the correct FastText format (.bin)
fasttext_model = FastText.load_fasttext_format('cc.ur.300.bin')





  fasttext_model = FastText.load_fasttext_format('cc.ur.300.bin')


In [12]:
def get_sentence_embedding(sentence, model):
    """
    Computes the mean FastText embedding for a sentence.
    """
    words = sentence.split()
    embeddings = []
    for word in words:
        if word in model.wv:  # Check if the word is in the vocabulary
            embeddings.append(model.wv[word])  # Access the word vector using the 'wv' attribute
    if embeddings:
        return np.mean(embeddings, axis=0)
    else:
        return np.zeros(model.vector_size)  # Return a zero vector if no words are found

# Combine 'text' and 'lemma' into a single column for embedding
data['text_lemma'] = data['text'] + " " + data['lemma']

# Generate FastText embeddings for each row
data['fasttext_embedding'] = data['text_lemma'].apply(lambda x: get_sentence_embedding(x, fasttext_model))

# Convert the embeddings into a numpy array
embedding_matrix = np.vstack(data['fasttext_embedding'].values)

# Drop the original text-related columns (optional)
data = data.drop(columns=['text', 'lemma', 'text_lemma', 'fasttext_embedding'])

In [13]:
# Add FastText embeddings as features
embedding_df = pd.DataFrame(embedding_matrix, columns=[f'ft_dim_{i}' for i in range(embedding_matrix.shape[1])])
data = pd.concat([data.reset_index(drop=True), embedding_df.reset_index(drop=True)], axis=1)


In [14]:
# Define the feature matrix (drop 'y') and target
X = data.drop(columns=['y'])
y = data['y']

In [15]:
# Split into training (64%), validation (16%), and test (20%) sets
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.36, random_state=42, stratify=y)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.56, random_state=42, stratify=y_temp)

In [16]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report, accuracy_score
import matplotlib.pyplot as plt
from sklearn import tree

# Initialize the Decision Tree Classifier
dt_model = DecisionTreeClassifier(random_state=42, max_depth=10)  
# Fit the model on the training set
dt_model.fit(X_train, y_train)

# Predict on the validation and test sets
y_val_pred = dt_model.predict(X_val)
y_test_pred = dt_model.predict(X_test)

# Evaluate the model
print("Decision Tree - Validation Set:")
print(classification_report(y_val, y_val_pred))

print("Decision Tree - Test Set:")
print(classification_report(y_test, y_test_pred))


Decision Tree - Validation Set:
              precision    recall  f1-score   support

           0       0.98      0.99      0.99     36713
           1       0.76      0.55      0.64      1427

    accuracy                           0.98     38140
   macro avg       0.87      0.77      0.81     38140
weighted avg       0.97      0.98      0.97     38140

Decision Tree - Test Set:
              precision    recall  f1-score   support

           0       0.98      0.99      0.99     46727
           1       0.76      0.55      0.64      1817

    accuracy                           0.98     48544
   macro avg       0.87      0.77      0.81     48544
weighted avg       0.97      0.98      0.97     48544



In [17]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score

# Initialize Random Forest
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)

# Fit the model
rf_model.fit(X_train, y_train)

# Predict on validation and test sets
y_val_pred_rf = rf_model.predict(X_val)
y_test_pred_rf = rf_model.predict(X_test)

# Evaluate Random Forest
print("Random Forest - Validation Set:")
print(classification_report(y_val, y_val_pred_rf))
print("Random Forest - Test Set:")
print(classification_report(y_test, y_test_pred_rf))

Random Forest - Validation Set:
              precision    recall  f1-score   support

           0       0.98      0.99      0.99     36713
           1       0.74      0.59      0.66      1427

    accuracy                           0.98     38140
   macro avg       0.86      0.79      0.82     38140
weighted avg       0.98      0.98      0.98     38140

Random Forest - Test Set:
              precision    recall  f1-score   support

           0       0.98      0.99      0.99     46727
           1       0.72      0.59      0.65      1817

    accuracy                           0.98     48544
   macro avg       0.85      0.79      0.82     48544
weighted avg       0.97      0.98      0.98     48544



In [18]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score
from imblearn.over_sampling import SMOTE

# Apply SMOTE to the training data
smote = SMOTE(random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)

In [19]:
# Train the logistic regression model
logistic_model = LogisticRegression(random_state=42)
logistic_model.fit(X_train_resampled, y_train_resampled)

# Make predictions and evaluate the model
y_train_pred = logistic_model.predict(X_train_resampled)
y_val_pred = logistic_model.predict(X_val)
y_test_pred = logistic_model.predict(X_test)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [20]:
# Evaluate the Logistic Regression model on all datasets
print("Logistic Regression - Training Set:")
print(classification_report(y_train_resampled, y_train_pred))

print("Logistic Regression - Validation Set:")
print(classification_report(y_val, y_val_pred))

print("Logistic Regression - Test Set:")
print(classification_report(y_test, y_test_pred))

Logistic Regression - Training Set:
              precision    recall  f1-score   support

           0       0.94      0.87      0.90    148335
           1       0.88      0.94      0.91    148335

    accuracy                           0.91    296670
   macro avg       0.91      0.91      0.91    296670
weighted avg       0.91      0.91      0.91    296670

Logistic Regression - Validation Set:
              precision    recall  f1-score   support

           0       1.00      0.87      0.93     36713
           1       0.21      0.91      0.34      1427

    accuracy                           0.87     38140
   macro avg       0.60      0.89      0.63     38140
weighted avg       0.97      0.87      0.91     38140

Logistic Regression - Test Set:
              precision    recall  f1-score   support

           0       1.00      0.87      0.93     46727
           1       0.21      0.90      0.34      1817

    accuracy                           0.87     48544
   macro avg       0.6

In [21]:
from sklearn.utils.class_weight import compute_class_weight
import numpy as np
# Compute class weights based on the class distribution in the target variable y
class_weights = compute_class_weight('balanced', classes=np.unique(y_train), y=y_train)
class_weight_dict = dict(zip(np.unique(y_train), class_weights))

In [22]:
# Convert data to NumPy arrays
X_train_resampled = np.array(X_train_resampled)
y_train_resampled = np.array(y_train_resampled)
X_val = np.array(X_val)
y_val = np.array(y_val)
X_test = np.array(X_test)
y_test = np.array(y_test)


In [23]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.utils.class_weight import compute_class_weight
from imblearn.over_sampling import SMOTE
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout, BatchNormalization
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping
from sklearn.metrics import classification_report, confusion_matrix
import matplotlib.pyplot as plt




In [24]:
# Define the DNN model
model = Sequential([
    Dense(128, activation='relu', input_dim=X_train_resampled.shape[1]),  # Input layer
    BatchNormalization(),
    Dropout(0.3),
    Dense(64, activation='relu'),  # Hidden layer
    BatchNormalization(),
    Dropout(0.3),
    Dense(32, activation='relu'),  # Hidden layer
    BatchNormalization(),
    Dropout(0.3),
    Dense(1, activation='sigmoid')  # Output layer (binary classification)
])

# Compile the model
model.compile(optimizer=Adam(learning_rate=0.001),
              loss='binary_crossentropy',
              metrics=['accuracy'])




In [25]:
# Early stopping to avoid overfitting
early_stopping = EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True)

# Train the model
history = model.fit(
    X_train_resampled, y_train_resampled,
    validation_data=(X_val, y_val),
    epochs=10,
    batch_size=32,
    class_weight=class_weight_dict,
    callbacks=[early_stopping]
)


Epoch 1/10


Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [26]:
# Evaluate the model on the test set
test_loss, test_accuracy = model.evaluate(X_test, y_test)
print(f"Test Loss: {test_loss}")
print(f"Test Accuracy: {test_accuracy}")

# Predict on test data
y_pred = (model.predict(X_test) > 0.5).astype("int32")

# Generate classification report and confusion matrix
print("Classification Report:\n", classification_report(y_test, y_pred))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))

Test Loss: 0.5546834468841553
Test Accuracy: 0.8561922907829285
Classification Report:
               precision    recall  f1-score   support

           0       1.00      0.85      0.92     46727
           1       0.21      0.99      0.34      1817

    accuracy                           0.86     48544
   macro avg       0.60      0.92      0.63     48544
weighted avg       0.97      0.86      0.90     48544

Confusion Matrix:
 [[39759  6968]
 [   13  1804]]
