In [1]:
!which python

/usr/bin/python


# Dataset 

Picking up class 10th CBSE book "First Flight", cause its a book with a lot of content which is relevel to this project. Converting that book into a dataset and then working further upon it.

In [None]:
import PyPDF2
import re
import nltk
import pandas as pd
import os
import glob

nltk.download('punkt')

def extract_text_from_pdf(pdf_path):
    text = ""
    with open(pdf_path, 'rb') as file:
        reader = PyPDF2.PdfReader(file)
        for page in reader.pages:
            page_text = page.extract_text()
            if page_text:
                text += page_text + "\n"
    return text

def clean_text(text):
    text = text.strip()
    text = re.sub(r'([.?!])\s+', r'\1\n', text)
    text = re.sub(r'\s+', ' ', text).strip()
    return text

def tokenize_sentences(text):
    return nltk.sent_tokenize(text)

pdf_directory = 'jeff1dd'
pdf_files = glob.glob(os.path.join(pdf_directory, '*.pdf'))

all_data = []

for pdf_file in pdf_files:
    chapter_name = os.path.basename(pdf_file)
    print(f"Processing {chapter_name}...")
    raw_text = extract_text_from_pdf(pdf_file)
    cleaned_text = clean_text(raw_text)
    sentences = tokenize_sentences(cleaned_text)
    
    for sentence in sentences:
        all_data.append({'chapter': chapter_name, 'sentence': sentence})

df = pd.DataFrame(all_data)
df.to_csv('compiled_chapters_dataset.csv', index=False) 
print("Dataset compiled and saved to compiled_chapters_dataset.csv")

[nltk_data] Downloading package punkt to /home/kxngh/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Processing jeff101.pdf...
Processing jeff102.pdf...
Processing jeff103.pdf...
Processing jeff104.pdf...
Processing jeff105.pdf...
Processing jeff106.pdf...
Processing jeff107.pdf...
Processing jeff108.pdf...
Processing jeff109.pdf...
Dataset compiled and saved to compiled_chapters_dataset.csv


In [None]:
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')

task_keywords = ["has to", "need to", "must", "should", "please", "kindly", "required to", "action", "task"]

def is_task_sentence(sentence):
    sentence_lower = sentence.lower()
    if any(keyword in sentence_lower for keyword in task_keywords):
        return True
    
    tokens = nltk.word_tokenize(sentence)
    if tokens:
        pos_tags = nltk.pos_tag(tokens)
        if pos_tags[0][1].startswith("VB"):
            return True
    return False

df = pd.read_csv("compiled_chapters_dataset.csv")
df['is_task'] = df['sentence'].apply(is_task_sentence)

task_df = df[df['is_task']]

task_df.to_csv("extracted_tasks_dataset.csv", index=False)

print(f"Extracted {len(task_df)} potential task sentences out of {len(df)} total sentences.")
print(task_df.head())


[nltk_data] Downloading package punkt to /home/kxngh/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/kxngh/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


Extracted 285 potential task sentences out of 2653 total sentences.
        chapter                                           sentence  is_task
1   jeff101.pdf               But what should we put our faith in?     True
6   jeff101.pdf  Think what your answers to these questions wou...     True
8   jeff101.pdf  Have you ever sent or received money in this way?     True
23  jeff101.pdf  Use a part of your pocket money, and submit th...     True
24  jeff101.pdf  See how your partner enjoys getting money by p...     True


In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans

df_tasks = pd.read_csv("extracted_tasks_dataset.csv")

task_sentences = df_tasks['sentence'].tolist()

vectorizer = TfidfVectorizer(stop_words='english')
X = vectorizer.fit_transform(task_sentences)

n_clusters = 4

kmeans = KMeans(n_clusters=n_clusters, random_state=42)
kmeans.fit(X)

df_tasks['cluster'] = kmeans.labels_

for i in range(n_clusters):
    print(f"\nCluster {i} samples:")
    print(df_tasks[df_tasks['cluster'] == i]['sentence'].head(5).to_string(index=False))

df_tasks.to_csv("categorized_tasks_dataset.csv", index=False)
print("\nCategorized tasks saved to 'categorized_tasks_dataset.csv'.")


Cluster 0 samples:
•Guided them through the reading activity by pr...
Activit y: Before filling out the form, get the...
Give them enough time to read, and then discuss...
                                Let freedom reign.
At first, as a student, I wanted freedom only f...

Cluster 1 samples:
              But what should we put our faith in?
 Have you ever sent or received money in this way?
Use a part of your pocket money, and submit the...
See how your partner enjoys getting money by post!
(i)In addition to the sender , the for m has to...

Cluster 2 samples:
When he finished, he went to the window to buy ...
Join the sentences given below using who, whom ...
Given below is the passage for listening activi...
Given below are sentences carrying one part of ...
                              Does he arrive safe?

Cluster 3 samples:
Think what your answers to these questions woul...
Use the ‘Oral Comprehension Checks’ in the appr...
Check your guess with this news item (from the ...
Ma

  super()._check_params_vs_input(X, default_n_init=10)


In [None]:
import pandas as pd
import nltk
import re

nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('maxent_ne_chunker')
nltk.download('words')

task_keywords = ["has to", "need to", "must", "should", "please", "kindly", "required to", "action", "task"]

def is_task_sentence(sentence):
    sentence_lower = sentence.lower()
    if any(keyword in sentence_lower for keyword in task_keywords):
        return True

    tokens = nltk.word_tokenize(sentence)
    if tokens:
        pos_tags = nltk.pos_tag(tokens)
        if pos_tags[0][1].startswith("VB"):
            return True

    return False

def extract_agent(sentence):
    tokens = nltk.word_tokenize(sentence)
    pos_tags = nltk.pos_tag(tokens)
    for word, tag in pos_tags:
        if tag in ['NNP', 'NNPS']:
            return word
    return None

def extract_deadline(sentence):
    time_pattern = r'\bby\s+(\d{1,2}(?::\d{2})?\s*(am|pm))\b'
    match = re.search(time_pattern, sentence, flags=re.IGNORECASE)
    if match:
        return match.group(0)
    
    deadline_keywords = ["tomorrow", "today", "next week", "by the end of day"]
    for keyword in deadline_keywords:
        if keyword in sentence.lower():
            return keyword
    return None

df = pd.read_csv("compiled_chapters_dataset.csv")

def process_sentence(row):
    sentence = row['sentence']
    task_flag = is_task_sentence(sentence)
    agent = extract_agent(sentence) if task_flag else None
    deadline = extract_deadline(sentence) if task_flag else None
    return pd.Series({'is_task': task_flag, 'agent': agent, 'deadline': deadline})

task_details = df.apply(process_sentence, axis=1)
df = pd.concat([df, task_details], axis=1)

df_tasks = df[df['is_task'] == True].copy()

output_csv = "extracted_tasks_with_details.csv"
df_tasks.to_csv(output_csv, index=False)

print(f"Processed {len(df)} sentences. Found {len(df_tasks)} potential task sentences.")
print("Sample output:")
print(df_tasks.head())


[nltk_data] Downloading package punkt to /home/kxngh/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/kxngh/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package maxent_ne_chunker to
[nltk_data]     /home/kxngh/nltk_data...
[nltk_data]   Package maxent_ne_chunker is already up-to-date!
[nltk_data] Downloading package words to /home/kxngh/nltk_data...
[nltk_data]   Package words is already up-to-date!


Processed 2653 sentences. Found 285 potential task sentences.
Sample output:
        chapter                                           sentence  is_task  \
1   jeff101.pdf               But what should we put our faith in?     True   
6   jeff101.pdf  Think what your answers to these questions wou...     True   
8   jeff101.pdf  Have you ever sent or received money in this way?     True   
23  jeff101.pdf  Use a part of your pocket money, and submit th...     True   
24  jeff101.pdf  See how your partner enjoys getting money by p...     True   

   agent deadline  
1   None     None  
6   None     None  
8   None     None  
23  None     None  
24  None     None  


In [None]:
import pandas as pd
import nltk
from nltk.corpus import stopwords
import gensim
import gensim.corpora as corpora
from gensim.models import LdaModel

nltk.download('stopwords')
stop_words = stopwords.words('english')

df_tasks = pd.read_csv("extracted_tasks_with_details.csv")
task_sentences = df_tasks['sentence'].tolist()

def preprocess_text(text):
    tokens = gensim.utils.simple_preprocess(text, deacc=True)
    tokens = [token for token in tokens if token not in stop_words]
    return tokens

processed_docs = [preprocess_text(sentence) for sentence in task_sentences]

dictionary = corpora.Dictionary(processed_docs)
corpus = [dictionary.doc2bow(doc) for doc in processed_docs]

num_topics = 4
lda_model = LdaModel(corpus=corpus, id2word=dictionary, num_topics=num_topics, 
                     random_state=42, passes=10)

for idx, topic in lda_model.print_topics(num_words=5):
    print(f"Topic {idx}: {topic}")

def assign_topic(text):
    tokens = preprocess_text(text)
    bow = dictionary.doc2bow(tokens)
    topic_probs = lda_model.get_document_topics(bow)
    if topic_probs:
        top_topic = max(topic_probs, key=lambda x: x[1])[0]
        return top_topic
    return None

df_tasks['category'] = df_tasks['sentence'].apply(assign_topic)

output_csv = "categorized_tasks_with_LDA.csv"
df_tasks.to_csv(output_csv, index=False)

print(f"Categorized tasks saved to {output_csv}")


[nltk_data] Downloading package stopwords to /home/kxngh/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Topic 0: 0.007*"say" + 0.007*"tickets" + 0.007*"read" + 0.006*"students" + 0.006*"tiger"
Topic 1: 0.009*"something" + 0.008*"know" + 0.008*"said" + 0.006*"please" + 0.006*"given"
Topic 2: 0.016*"must" + 0.007*"bus" + 0.007*"get" + 0.005*"reprint" + 0.005*"please"
Topic 3: 0.012*"read" + 0.009*"think" + 0.009*"must" + 0.005*"story" + 0.005*"one"
Categorized tasks saved to categorized_tasks_with_LDA.csv


In [None]:
import pandas as pd
from sentence_transformers import SentenceTransformer
from sklearn.cluster import KMeans

df_tasks = pd.read_csv("extracted_tasks_with_details.csv")
task_sentences = df_tasks['sentence'].tolist()

model = SentenceTransformer('all-MiniLM-L6-v2')
embeddings = model.encode(task_sentences)

n_clusters = 4
kmeans = KMeans(n_clusters=n_clusters, random_state=42)
kmeans.fit(embeddings)
labels = kmeans.labels_

df_tasks['category'] = labels

for i in range(n_clusters):
    print(f"\nCluster {i} sample sentences:")
    print(df_tasks[df_tasks['category'] == i]['sentence'].head(5).to_string(index=False))
    print("\n")

output_csv = "categorized_tasks_with_bert_4clusters.csv"
df_tasks.to_csv(output_csv, index=False)
print(f"Categorized tasks saved to '{output_csv}'")


  super()._check_params_vs_input(X, default_n_init=10)



Cluster 0 sample sentences:
              But what should we put our faith in?
Think what your answers to these questions woul...
 Have you ever sent or received money in this way?
Use a part of your pocket money, and submit the...
See how your partner enjoys getting money by post!



Cluster 1 sample sentences:
  Do you think a crow is often mentioned in poems?
Read the poem silently once, and say which stan...
He should be lurking in shadow, Sliding through...
He should be snarling around houses At the jung...
LESLIE NORRIS Reprint 2024-25 snarls: makes an ...



Cluster 2 sample sentences:
Join the sentences given below using who, whom ...
IV.Using Negatives for Emphasis We know that se...
     Try to say what qualities are being compared.
•Guided them through the reading activity by pr...
•Provided interesting exercises to strengthen s...



Cluster 3 sample sentences:
Have you experienced a similar moment that chan...
                                Let freedom reign.
           

Cluster 0:
This cluster appears to group sentences that are reflective or discussion-oriented. They often pose questions (e.g., “But what should we put our faith in?”) or prompt the reader to think (e.g., “Think what your answers to these questions would be…”). These might be more about prompting discussion or reflection rather than direct action commands.

Cluster 1:
The sentences here include descriptive and creative language (“He should be lurking in shadow, sliding through…” and “He should be snarling around houses…”) along with literary analysis questions (e.g., “Do you think a crow is often mentioned in poems?”). This cluster could represent tasks that involve creative description or critical analysis of text.

Cluster 2:
This group contains more structured instructions or directives—for example, tasks like “Join the sentences given below using who, whom…” and directives related to language exercises (“Try to say what qualities are being compared…”). It seems to capture more formal, exercise-like task instructions.

Cluster 3:
The sentences here are short, direct, and imperative (e.g., “Choose the right answer.”, “Make it as humorous as possible.”). They are clear commands or directives and could be seen as straightforward task instructions that require immediate action.

# Part B

In [None]:
import pandas as pd
import numpy as np
import re
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report
from sklearn.pipeline import Pipeline
from sklearn.ensemble import StackingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

nltk.download('punkt', quiet=True)
nltk.download('stopwords', quiet=True)
nltk.download('wordnet', quiet=True)

class AdvancedTextPreprocessor:
    def __init__(self):
        self.lemmatizer = WordNetLemmatizer()
        self.stop_words = set(stopwords.words('english'))
    
    def preprocess(self, text):
        if not isinstance(text, str):
            return ""
        text = text.lower()
        text = re.sub(r'[^a-z\s]', '', text)
        tokens = word_tokenize(text)
        cleaned_tokens = [
            self.lemmatizer.lemmatize(token) 
            for token in tokens 
            if token not in self.stop_words and len(token) > 1
        ]
        return ' '.join(cleaned_tokens)

df = pd.read_csv("csv_data.csv")

preprocessor = AdvancedTextPreprocessor()
df['clean_review'] = df['review'].apply(preprocessor.preprocess)
df['sentiment'] = df['sentiment'].str.strip().str.lower()
df['sentiment'] = df['sentiment'].replace({"postive": "positive"})

le = LabelEncoder()
df['sentiment_encoded'] = le.fit_transform(df['sentiment'])

X = df['clean_review']
y = df['sentiment_encoded']

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

base_classifiers = [
    ('lr', LogisticRegression(max_iter=1000, class_weight='balanced', C=0.1)),
    ('svm', SVC(probability=True, class_weight='balanced', kernel='rbf')),
    ('rf', RandomForestClassifier(n_estimators=200, max_depth=10, class_weight='balanced')),
    ('gb', GradientBoostingClassifier(n_estimators=250, learning_rate=0.05, max_depth=5))
]

stacking_classifier = StackingClassifier(
    estimators=base_classifiers,
    final_estimator=LogisticRegression(max_iter=1000),
    cv=5
)

pipeline = Pipeline([
    ('tfidf', TfidfVectorizer(
        stop_words='english', 
        ngram_range=(1,3),
        max_features=8000,
        sublinear_tf=True,
        max_df=0.8,
        min_df=2
    )),
    ('classifier', stacking_classifier)
])

cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
cv_scores = cross_val_score(pipeline, X_train, y_train, cv=cv, scoring='f1')

print("Cross-validation F1 Scores:", cv_scores)
print("Mean CV F1 Score:", cv_scores.mean())

pipeline.fit(X_train, y_train)

y_pred = pipeline.predict(X_test)

y_test_labels = le.inverse_transform(y_test)
y_pred_labels = le.inverse_transform(y_pred)

print("\nDetailed Evaluation Metrics:")
print(f"Accuracy: {accuracy_score(y_test_labels, y_pred_labels):.4f}")
print(f"Precision: {precision_score(y_test_labels, y_pred_labels, pos_label='positive'):.4f}")
print(f"Recall: {recall_score(y_test_labels, y_pred_labels, pos_label='positive'):.4f}")
print(f"F1 Score: {f1_score(y_test_labels, y_pred_labels, pos_label='positive'):.4f}")

print("\nClassification Report:")
print(classification_report(y_test_labels, y_pred_labels))

def bootstrap_metric(y_true, y_pred, metric_func, n_iterations=1000, alpha=0.05):
    metrics = []
    for _ in range(n_iterations):
        indices = np.random.randint(0, len(y_true), len(y_true))
        sample_true = y_true[indices]
        sample_pred = y_pred[indices]
        metrics.append(metric_func(sample_true, sample_pred))
    confidence_interval = np.percentile(metrics, [alpha/2*100, (1-alpha/2)*100])
    return np.mean(metrics), confidence_interval

accuracy_mean, accuracy_ci = bootstrap_metric(
    np.array(y_test_labels), 
    np.array(y_pred_labels), 
    accuracy_score
)
precision_mean, precision_ci = bootstrap_metric(
    np.array(y_test_labels), 
    np.array(y_pred_labels), 
    lambda y_t, y_p: precision_score(y_t, y_p, pos_label='positive')
)
recall_mean, recall_ci = bootstrap_metric(
    np.array(y_test_labels), 
    np.array(y_pred_labels), 
    lambda y_t, y_p: recall_score(y_t, y_p, pos_label='positive')
)

print("\nBootstrap Confidence Intervals:")
print(f"Accuracy: {accuracy_mean:.4f} (95% CI: {accuracy_ci})")
print(f"Precision: {precision_mean:.4f} (95% CI: {precision_ci})")
print(f"Recall: {recall_mean:.4f} (95% CI: {recall_ci})")


Cross-validation F1 Scores: [0.84848485 0.79020979 0.7766323  0.80565371 0.82068966]
Mean CV F1 Score: 0.8083340613039802

Detailed Evaluation Metrics:
Accuracy: 0.9067
Precision: 0.9321
Recall: 0.7947
F1 Score: 0.8580

Classification Report:
              precision    recall  f1-score   support

    negative       0.90      0.97      0.93       346
    positive       0.93      0.79      0.86       190

    accuracy                           0.91       536
   macro avg       0.91      0.88      0.89       536
weighted avg       0.91      0.91      0.90       536


Bootstrap Confidence Intervals:
Accuracy: 0.9073 (95% CI: [0.88059701 0.93283582])
Precision: 0.9312 (95% CI: [0.89022806 0.96987952])
Recall: 0.7924 (95% CI: [0.73429309 0.84537166])


In [3]:
# Save positive and negative clusters into separate CSV files
positive_reviews = df[df['sentiment_encoded'] == le.transform(['positive'])[0]]
negative_reviews = df[df['sentiment_encoded'] == le.transform(['negative'])[0]]

positive_reviews.to_csv("positive_reviews.csv", index=False)
negative_reviews.to_csv("negative_reviews.csv", index=False)

print("Clusters saved: positive_reviews.csv and negative_reviews.csv")


Clusters saved: positive_reviews.csv and negative_reviews.csv
