# Task-A

In [30]:
import spacy
import dateparser
from datetime import datetime, timedelta
import os

# Load the spaCy language model for English (small model)
nlp = spacy.load("en_core_web_sm")

# Define a set of task-related keywords
TASK_KEYWORDS = {"must", "should", "has to", "have to", "needs to", "is required to"}

# Date Parsing Module

def parse_date(date_text):
    """
    Parses date text and returns a formatted date string.
    
    This function handles:
      - Relative terms like "today" and "tomorrow".
      - Weekday names (e.g., "next Friday") by computing the next occurrence.
      - Other relative date expressions using dateparser.
    
    Parameters:
        date_text (str): The date string extracted from text.
    
    Returns:
        str or None: A formatted date string (or day name) if parsing is successful;
                     otherwise, None.
    """
    # Check for simple relative dates
    if "today" in date_text.lower():
        return "today"
    elif "tomorrow" in date_text.lower():
        return "tomorrow"
    
    # Mapping for week days
    week_days = {
        "monday": 0, "tuesday": 1, "wednesday": 2, "thursday": 3,
        "friday": 4, "saturday": 5, "sunday": 6
    }
    
    # Look for any weekday mention in the date text
    for day in week_days:
        if day in date_text.lower():
            today = datetime.today()
            target_day = week_days[day]
            days_ahead = target_day - today.weekday()
            if days_ahead <= 0:  # If the day is today or in the past, select next week's occurrence
                days_ahead += 7
            next_weekday = today + timedelta(days=days_ahead)
            return next_weekday.strftime('%A')  # Return the day name (e.g., 'Monday')
    
    # For other relative dates, use dateparser with settings favoring future dates
    parsed_date = dateparser.parse(date_text, settings={'PREFER_DATES_FROM': 'future', 'RELATIVE_BASE': datetime.today()})
    if parsed_date:
        return parsed_date.strftime('%Y-%m-%d %H:%M:%S')
    return None

# Task Extraction Module

def extract_task_details(sentences):
    """
    Extracts task details (responsible entity, deadline, and category) from a list of sentences.
    
    The function processes each sentence to determine if it represents a task based on the presence
    of predefined keywords. It then uses spaCy's named entity recognition to extract:
      - 'who' is responsible for the task (e.g., a person or organization).
      - The deadline by searching for date/time entities.
      - The task category via keyword matching.
    
    Parameters:
        sentences (list of str): List of sentences from the text.
    
    Returns:
        list of dict: Each dictionary contains details for a task:
                      {
                          "task": original task sentence,
                          "who": responsible person or organization,
                          "deadline": extracted deadline or "no deadline",
                          "category": assigned category of the task
                      }
    """
    tasks = []
    
    for sentence in sentences:
        # Skip empty sentences after stripping whitespace
        if not sentence.strip():
            continue
        
        # Process sentence with spaCy NLP model
        doc = nlp(sentence)
        task_info = {
            "task": sentence.strip(),
            "who": None,
            "deadline": "no deadline",
            "category": None
        }
        
        # Determine if the sentence likely represents a task using task keywords
        if any(keyword in sentence.lower() for keyword in TASK_KEYWORDS):
            # Attempt to extract the responsible entity (e.g., a PERSON or ORG)
            for ent in doc.ents:
                if ent.label_ in {"PERSON", "ORG"}:
                    task_info["who"] = ent.text
                    break
            
            # If no entity is detected, try to extract the subject manually using dependency parsing
            if not task_info["who"]:
                for token in doc:
                    if token.dep_ == "nsubj":
                        task_info["who"] = token.text
                        break
            
            # Extract deadline information from DATE or TIME entities
            for ent in doc.ents:
                if ent.label_ in {"DATE", "TIME"}:
                    parsed_date = parse_date(ent.text)
                    if parsed_date:
                        task_info["deadline"] = parsed_date
                    break  # Use the first encountered date/time entity
            
            # Categorize the task based on action words present in the sentence
            task_info["category"] = categorize_task(task_info["task"])
            
            # Append the task details to the results list
            tasks.append(task_info)
    
    return tasks


# Task Categorization Module

def categorize_task(task):
    """
    Assigns a category to the task based on action keywords found in the task description.
    
    Supported categories include:
      - Shopping, Cleaning, Work, Appointments.
    
    If no specific keywords are found, the task is assigned to the "General" category.
    
    Parameters:
        task (str): The task description.
    
    Returns:
        str: The category name.
    """
    categories = {
        "Shopping": ["buy", "purchase", "snack"],
        "Cleaning": ["clean", "sweep", "wash", "cleaning"],
        "Work": ["submit", "review", "write", "send", "report"],
        "Appointments": ["meet", "call", "schedule", "appointment"]
    }
    
    # Loop over each category and check if any keyword is present in the task
    for category, keywords in categories.items():
        if any(word in task.lower() for word in keywords):
            return category
    return "General"


# File Reading and Processing Module

def read_file_and_process(file_path):
    """
    Reads text data from a file, splits it into sentences, and extracts task details.
    
    Parameters:
        file_path (str): Path to the input text file.
    
    Returns:
        list of dict: A list of task details dictionaries extracted from the text.
    """
    with open(file_path, 'r', encoding='utf-8') as file:
        text = file.read()
    
    # Rudimentary sentence splitting (could be replaced with a more sophisticated method)
    sentences = text.split('.')
    
    # Extract task details from the list of sentences
    tasks = extract_task_details(sentences)
    return tasks

# Main Pipeline Runner

def main():
    """
    Main function to run the task extraction pipeline.
    
    The function performs the following steps:
      1. Checks for the existence of an input file at the specified path.
      2. If found, processes the file to extract task details.
      3. If not found, uses a manually curated sample text.
      4. Outputs the extracted tasks.
      5. Displays insights and challenges encountered during development.
    """
    # Define the file path (update this path as needed)
    file_path = "C:/Users/asus/Desktop/Text_Extractor/Extract.txt"
    
    # Check if the file exists; if not, use a manually curated sample
    if os.path.exists(file_path):
        tasks = read_file_and_process(file_path)
        print(f"Extracted {len(tasks)} tasks from file '{file_path}':\n")
    else:
        print(f"File not found: {file_path}. Using a manual sample instead.\n")
        sample_text = (
            "John must submit the report by next Friday. "
            "Mary should schedule an appointment for tomorrow. "
            "Alex has to buy groceries today. "
            "The team needs to review the project proposal next Monday."
        )
        # Split the sample text into sentences
        sentences = sample_text.split('.')
        tasks = extract_task_details(sentences)
    
    # Output the extracted tasks
    for task in tasks:
        print(task)
   
    

# Execute the main pipeline if this script is run directly
if __name__ == '__main__':
    main()


Extracted 3 tasks from file 'C:/Users/asus/Desktop/Text_Extractor/Extract.txt':

{'task': 'hn must complete the annual budget review by next Friday', 'who': 'hn', 'deadline': 'Friday', 'category': 'Work'}
{'task': 'Mary has to schedule the project kickoff meeting for tomorrow afternoon', 'who': 'Mary', 'deadline': 'tomorrow', 'category': 'Appointments'}
{'task': 'Alex is required to finalize the vendor contract negotiations before the client event next Monday', 'who': 'Alex', 'deadline': 'Monday', 'category': 'General'}


In [36]:
import spacy
import dateparser
from datetime import datetime, timedelta
import os

# Load the spaCy language model for English (small model)
nlp = spacy.load("en_core_web_sm")

# Define a set of task-related keywords
TASK_KEYWORDS = {"must", "should", "has to", "have to", "needs to", "is required to"}

# Date Parsing Module
def parse_date(date_text):
    """
    Parses date text and returns a textual representation of the date.
    
    Instead of computing the actual date, this function returns a text string
    that reflects the relative date mentioned (e.g., "today", "tomorrow", "next week", "next weekend", or specific weekday names).
    
    Parameters:
        date_text (str): The date string extracted from text.
    
    Returns:
        str: A textual representation of the date if parsing is successful;
             otherwise, returns the original text stripped.
    """
    lower_text = date_text.lower().strip()
    
    # Check for simple relative dates
    if "today" in lower_text:
        return "today"
    elif "tomorrow" in lower_text:
        return "tomorrow"
    
    # Check for phrases with relative indicators in proper order:
    if "next weekend" in lower_text:
        return "next weekend"
    if "next week" in lower_text:
        return "next week"
    if "weekend" in lower_text:
        return "weekend"
    
    # Check for weekday names
    week_days = ["monday", "tuesday", "wednesday", "thursday", "friday", "saturday", "sunday"]
    for day in week_days:
        if day in lower_text:
            if "next" in lower_text:
                return "next " + day.capitalize()
            else:
                return day.capitalize()
    
    # For other relative dates, simply return the original text
    return date_text.strip()

# Task Extraction Module
def extract_task_details(sentences):
    """
    Extracts task details (responsible entity, deadline, and category) from a list of sentences.
    
    The function processes each sentence to determine if it represents a task based on the presence
    of predefined keywords. It then uses spaCy's named entity recognition to extract:
      - 'who' is responsible for the task (e.g., a person or organization).
      - The deadline by searching for date/time entities.
      - The task category via keyword matching.
    
    Parameters:
        sentences (list of str): List of sentences from the text.
    
    Returns:
        list of dict: Each dictionary contains details for a task:
                      {
                          "task": original task sentence,
                          "who": responsible person or organization,
                          "deadline": extracted deadline or "no deadline",
                          "category": assigned category of the task
                      }
    """
    tasks = []
    
    for sentence in sentences:
        # Skip empty sentences after stripping whitespace
        if not sentence.strip():
            continue
        
        # Process sentence with spaCy NLP model
        doc = nlp(sentence)
        task_info = {
            "task": sentence.strip(),
            "who": None,
            "deadline": "no deadline",
            "category": None
        }
        
        # Determine if the sentence likely represents a task using task keywords
        if any(keyword in sentence.lower() for keyword in TASK_KEYWORDS):
            # Attempt to extract the responsible entity (e.g., a PERSON or ORG)
            for ent in doc.ents:
                if ent.label_ in {"PERSON", "ORG"}:
                    task_info["who"] = ent.text
                    break
            
            # If no entity is detected, try to extract the subject manually using dependency parsing
            if not task_info["who"]:
                for token in doc:
                    if token.dep_ == "nsubj":
                        task_info["who"] = token.text
                        break
            
            # Extract deadline information from DATE or TIME entities
            for ent in doc.ents:
                if ent.label_ in {"DATE", "TIME"}:
                    parsed_date = parse_date(ent.text)
                    if parsed_date:
                        task_info["deadline"] = parsed_date
                    break  # Use the first encountered date/time entity
            
            # Categorize the task based on action words present in the sentence
            task_info["category"] = categorize_task(task_info["task"])
            
            # Append the task details to the results list
            tasks.append(task_info)
    
    return tasks

# Task Categorization Module
def categorize_task(task):
    """
    Assigns a category to the task based on action keywords found in the task description.
    
    Supported categories include:
      - Shopping, Cleaning, Work, Appointments.
    
    If no specific keywords are found, the task is assigned to the "General" category.
    
    Parameters:
        task (str): The task description.
    
    Returns:
        str: The category name.
    """
    categories = {
        "Shopping": ["buy", "purchase", "snack"],
        "Cleaning": ["clean", "sweep", "wash", "cleaning"],
        "Work": ["submit", "review", "write", "send", "report"],
        "Appointments": ["meet", "call", "schedule", "appointment"]
    }
    
    # Loop over each category and check if any keyword is present in the task
    for category, keywords in categories.items():
        if any(word in task.lower() for word in keywords):
            return category
    return "General"

# File Reading and Processing Module
def read_file_and_process(file_path):
    """
    Reads text data from a file, splits it into sentences, and extracts task details.
    
    Parameters:
        file_path (str): Path to the input text file.
    
    Returns:
        list of dict: A list of task details dictionaries extracted from the text.
    """
    with open(file_path, 'r', encoding='utf-8') as file:
        text = file.read()
    
    # Rudimentary sentence splitting (could be replaced with a more sophisticated method)
    sentences = text.split('.')
    
    # Extract task details from the list of sentences
    tasks = extract_task_details(sentences)
    return tasks

# Main Pipeline Runner
def main():
    """
    Main function to run the task extraction pipeline.
    
    The function performs the following steps:
      1. Checks for the existence of an input file at the specified path.
      2. If found, processes the file to extract task details.
      3. If not found, uses a manually curated sample text.
      4. Outputs the extracted tasks.
      5. Displays insights and challenges encountered during development.
    """
    # Define the file path (update this path as needed)
    file_path = "C:/Users/asus/Desktop/Text_Extractor/Extract.txt"
    
    # Check if the file exists; if not, use a manually curated sample
    if os.path.exists(file_path):
        tasks = read_file_and_process(file_path)
        print(f"Extracted {len(tasks)} tasks from file '{file_path}':\n")
    else:
        print(f"File not found: {file_path}. Using a manual sample instead.\n")
        sample_text = (
            "Alex is required to finalize the vendor contract negotiations before the client event next Weekend. "
            "John must submit the report by next Friday. "
            "Mary should schedule an appointment for tomorrow."
        )
        # Split the sample text into sentences
        sentences = sample_text.split('.')
        tasks = extract_task_details(sentences)
    
    # Output the extracted tasks
    for task in tasks:
        print(task)
   
# Execute the main pipeline if this script is run directly
if __name__ == '__main__':
    main()


Extracted 3 tasks from file 'C:/Users/asus/Desktop/Text_Extractor/Extract.txt':

{'task': 'john must complete the annual budget review by next Friday', 'who': 'john', 'deadline': 'next Friday', 'category': 'Work'}
{'task': 'Mary has to schedule the project kickoff meeting for tomorrow afternoon', 'who': 'Mary', 'deadline': 'tomorrow', 'category': 'Appointments'}
{'task': 'Alex is required to finalize the vendor contract negotiations before the client event next Weekend', 'who': 'Alex', 'deadline': 'next weekend', 'category': 'General'}


## Insights and Challenges
    
    
    1. Handling varied date formats and relative date expressions required careful design.")
    2. Extracting the responsible entity ('who') is challenging when sentences lack clear named entities.")
    3. Basic sentence splitting by periods may fail for more complex text; using advanced segmentation could improve accuracy.")
    4. Categorizing tasks using simple keyword matching is effective for basic scenarios but may miss nuanced or multi-faceted tasks.")
    5. spaCy's language model might occasionally miss context-specific cues, affecting both entity recognition and dependency parsing.")

## Task-B

In [14]:
# Import necessary libraries
import pandas as pd
import re
import nltk
nltk.download('stopwords')  # Ensure stopwords are available
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, classification_report

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\asus\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [15]:
# Preload stopwords and compile regex once for efficiency
STOP_WORDS = set(stopwords.words('english'))
CLEAN_PATTERN = re.compile(r'[^a-z\s]')


### Preprocessing

In [16]:
def preprocess_text(text):
    """
    Clean a single text string by:
      - converting to lowercase,
      - removing punctuation and digits,
      - tokenizing and removing stopwords.
    Returns the cleaned text.
    """
    text = text.lower()                              # Convert to lowercase
    text = CLEAN_PATTERN.sub('', text)               # Remove digits/punctuation
    tokens = text.split()                            # Tokenize by whitespace
    tokens = [word for word in tokens if word not in STOP_WORDS]  # Remove stopwords
    return ' '.join(tokens)

def preprocess_reviews(df, text_column='review'):
    """
    Preprocess all reviews in the DataFrame.
    
    Parameters:
      df          : Pandas DataFrame containing reviews.
      text_column : Column name containing raw text.
      
    Returns:
      df with an added 'clean_review' column.
    """
    df['clean_review'] = df[text_column].apply(preprocess_text)
    return df

### Feature Extraction Module

In [17]:
def extract_features(texts, max_features=5000):
    """
    Convert text data into numerical features using TF-IDF.
    
    Parameters:
      texts        : Iterable of cleaned text strings.
      max_features : Maximum number of features to consider.
      
    Returns:
      vectorizer : Fitted TfidfVectorizer.
      features   : Transformed text data as a sparse matrix.
    """
    vectorizer = TfidfVectorizer(max_features=max_features)
    features = vectorizer.fit_transform(texts)
    return vectorizer, features

### Categorization (Model Training) Module

In [18]:
def train_classifier(X, y, model_type='logistic', max_iter=1000):
    """
    Train a classifier to categorize reviews into positive or negative sentiment.
    
    Parameters:
      X          : Feature matrix.
      y          : Target labels.
      model_type : Type of model to use ('logistic' currently supported).
      max_iter   : Maximum iterations for model training.
      
    Returns:
      A trained model.
    """
    if model_type == 'logistic':
        model = LogisticRegression(max_iter=max_iter)
    else:
        raise ValueError("Unsupported model type. Try 'logistic'.")
    model.fit(X, y)
    return model

def evaluate_classifier(model, X_test, y_test):
    """
    Evaluate the trained classifier on test data.
    Prints a classification report along with accuracy, precision, and recall.
    
    Parameters:
      model   : Trained classifier.
      X_test  : Test features.
      y_test  : Test labels.
      
    Returns:
      Predictions made on X_test.
    """
    y_pred = model.predict(X_test)
    print("Classification Report:")
    print(classification_report(y_test, y_pred))
    print("Accuracy:", accuracy_score(y_test, y_pred))
    print("Precision:", precision_score(y_test, y_pred))
    print("Recall:", recall_score(y_test, y_pred))
    return y_pred

### Pipeline Runner

In [19]:
def run_pipeline(data_path, text_column='review', label_column='sentiment'):
    """
    End-to-end pipeline that:
      - Loads the dataset.
      - Preprocesses text data.
      - Extracts features using TF-IDF.
      - Splits data into training and test sets.
      - Trains a classifier.
      - Evaluates the classifier.
    
    Parameters:
      data_path   : Path to the CSV dataset.
      text_column : Name of the column with reviews.
      label_column: Name of the column with sentiment labels.
    
    Returns:
      model      : Trained classification model.
      vectorizer : Fitted TF-IDF vectorizer.
      df         : Processed DataFrame.
    """
    # Load the dataset
    df = pd.read_csv(data_path)
    
    # Convert sentiment labels to binary values (assuming the dataset uses 'positive' and 'negative')
    df[label_column] = df[label_column].map({'positive': 1, 'negative': 0})
    
    # Preprocess the reviews
    df = preprocess_reviews(df, text_column=text_column)
    
    # Extract features from the cleaned reviews
    vectorizer, features = extract_features(df['clean_review'])
    
    # Split data into training and testing sets (80/20 split)
    X_train, X_test, y_train, y_test = train_test_split(features, df[label_column], test_size=0.2, random_state=42)
    
    # Train the classifier
    model = train_classifier(X_train, y_train)
    
    # Evaluate the classifier
    evaluate_classifier(model, X_test, y_test)
    
    return model, vectorizer, df

### Manual Validation Module

In [20]:
def validate_manual_sample(model, vectorizer):
    """
    Validate the classifier with a small, manually curated sample of reviews.
    
    Parameters:
      model      : Trained classifier.
      vectorizer : Fitted TF-IDF vectorizer.
    
    Prints the original review and its predicted sentiment.
    """
    # Manually curated sample reviews
    sample_reviews = [
        "I absolutely loved this movie, it was fantastic!",
        "This product is terrible, I will never buy it again.",
        "What a wonderful book, it warmed my heart.",
        "Awful service, I'm very disappointed."
    ]
    
    # Preprocess the sample reviews
    cleaned_samples = [preprocess_text(review) for review in sample_reviews]
    
    # Convert the cleaned text to features using the already fitted vectorizer
    sample_features = vectorizer.transform(cleaned_samples)
    
    # Predict sentiment for the sample reviews
    predictions = model.predict(sample_features)
    
    # Map numeric predictions back to sentiment labels
    sentiment_mapping = {1: "positive", 0: "negative"}
    
    print("\nManual Validation on Curated Samples:")
    for review, pred in zip(sample_reviews, predictions):
        print("Review: ", review)
        print("Predicted Sentiment: ", sentiment_mapping[pred])
        print("-" * 50)

### Main Function to Run the Entire Pipeline

In [38]:
def main():
    # File path to the dataset (update as needed)
    data_path = 'C:/Users/asus/Desktop/Text_Extractor/imdb_reviews.csv'
    
    # Run the complete pipeline: load data, preprocess, extract features, train, and evaluate
    model, vectorizer, df = run_pipeline(data_path)
    
    # Validate the model with a manually curated sample
    validate_manual_sample(model, vectorizer)
    
    

# Run the main function if this script is executed
if __name__ == '__main__':
    main()

Classification Report:
              precision    recall  f1-score   support

           0       0.90      0.87      0.88      4961
           1       0.88      0.90      0.89      5039

    accuracy                           0.89     10000
   macro avg       0.89      0.89      0.89     10000
weighted avg       0.89      0.89      0.89     10000

Accuracy: 0.8872
Precision: 0.8778743961352657
Recall: 0.901567771383211

Manual Validation on Curated Samples:
Review:  I absolutely loved this movie, it was fantastic!
Predicted Sentiment:  positive
--------------------------------------------------
Review:  This product is terrible, I will never buy it again.
Predicted Sentiment:  negative
--------------------------------------------------
Review:  An average experience, nothing particularly memorable.
Predicted Sentiment:  negative
--------------------------------------------------
Review:  What a wonderful book, it warmed my heart.
Predicted Sentiment:  positive
-------------------------

### Insights and Challenges faced during the task:
    Insights and Challenges Faced
    1. Preprocessing needed careful balancing: Removing too much can lead to loss of sentiment cues.
    2. The TF-IDF vectorization process is computationally intensive on large datasets; limiting features is crucial.
    3. Tuning model hyperparameters is necessary for better performance.
    4. Manual validation is important to ensure that the model generalizes well to unseen data.