# scikit-learn: Working With Text Data

## Exercise 1

In [None]:
import os

from sklearn import metrics
from sklearn.feature_extraction.text import TfidfVectorizer
# SGDClassifier is defined in sklearn.linear_model; sklearn.svm does not export it,
# so importing it from there raises ImportError.
from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline

def custom_preprocessor(text):
    """Normalise a raw document before it is split into n-grams.

    The only normalisation applied is case folding with ``str.lower``, so
    that upper- and lower-case spellings produce the same features. Further
    cleaning steps (for example stripping non-alphabetic characters) can be
    added here if needed.

    Args:
        text: The raw document as a string.

    Returns:
        The lower-cased document.
    """
    return text.lower()

def load_data(directory):
    """Load a labelled text corpus laid out as one sub-directory per label.

    Every immediate sub-directory of ``directory`` is treated as one label
    (here: one language) and every regular file inside it as one UTF-8
    encoded document. Entries that are not directories at the top level, and
    entries that are not regular files inside a label directory, are skipped.

    Args:
        directory: Path to the corpus root.

    Returns:
        A ``(texts, labels)`` tuple of two equally long lists, where
        ``labels[i]`` is the name of the sub-directory ``texts[i]`` was read
        from. Documents are ordered by label name, then by file name.

    Raises:
        FileNotFoundError: If ``directory`` does not exist.
        UnicodeDecodeError: If a file is not valid UTF-8.
    """
    texts, labels = [], []
    # os.listdir() returns entries in arbitrary order. Sorting keeps the corpus
    # order stable, which a seeded train/test split needs to be reproducible.
    for label in sorted(os.listdir(directory)):
        label_dir = os.path.join(directory, label)
        if not os.path.isdir(label_dir):
            continue  # stray files next to the label folders are not labels
        for filename in sorted(os.listdir(label_dir)):
            filepath = os.path.join(label_dir, filename)
            if not os.path.isfile(filepath):
                continue  # open() would fail on a nested directory
            with open(filepath, encoding='utf-8') as f:
                texts.append(f.read())
            labels.append(label)
    return texts, labels

# Root folder of the language-identification corpus (one sub-directory per language).
# Change this to match your local folder layout.
data_dir = 'data/languages/paragraphs/'

# Read every document together with its language label.
texts, labels = load_data(data_dir)

# Hold out 20% of the documents for evaluation and train on the remaining 80%.
# random_state pins the shuffle so the split is repeatable for a given input order.
X_train, X_test, y_train, y_test = train_test_split(
    texts,
    labels,
    test_size=0.2,
    random_state=42,
)

# Two-stage pipeline: character n-gram TF-IDF features feeding a linear classifier.
text_clf = Pipeline(steps=[
    (
        'tfidf',
        TfidfVectorizer(
            analyzer='char',                    # features are character n-grams, not words
            ngram_range=(3, 5),                 # every n-gram of length 3, 4 and 5
            preprocessor=custom_preprocessor,   # lower-cases the text before n-gram extraction
        ),
    ),
    (
        'clf',
        SGDClassifier(
            loss='hinge',       # hinge loss gives a linear SVM
            penalty='l2',       # L2 regularization to limit overfitting
            alpha=1e-3,         # regularization multiplier: LARGER values mean stronger regularization
            random_state=42,    # reproducible shuffling during training
            max_iter=5,         # number of passes over the training data
            tol=None,           # no early stopping: always run all max_iter passes
        ),
    ),
])

# Fit the vectorizer and the classifier on the training portion.
text_clf.fit(X_train, y_train)

# Predict the language of every held-out document.
y_pred = text_clf.predict(X_test)

# Per-language precision, recall and F1-score.
report = metrics.classification_report(y_test, y_pred)
print(report)

# Confusion matrix: rows are the actual labels, columns the predicted labels.
confusion = metrics.confusion_matrix(y_test, y_pred)
print(confusion)


## Exercise 2

In [None]:
import os

from sklearn import metrics
from sklearn.feature_extraction.text import TfidfVectorizer
# SGDClassifier is defined in sklearn.linear_model; sklearn.svm does not export it,
# so importing it from there raises ImportError.
from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline

def load_data(directory, categories=('pos', 'neg')):
    """Load movie reviews stored as one text file per review.

    ``directory`` must contain one sub-directory per category; every regular
    file inside a category directory is read as one UTF-8 encoded review and
    labelled with the category name. Entries that are not regular files are
    skipped.

    Args:
        directory: Path to the folder holding the category sub-directories.
        categories: Names of the sub-directories to read, in the order their
            reviews should appear in the result. Defaults to
            ``('pos', 'neg')``.

    Returns:
        A ``(texts, labels)`` tuple of two equally long lists, where
        ``labels[i]`` is the category ``texts[i]`` was read from. Within a
        category, reviews are ordered by file name.

    Raises:
        FileNotFoundError: If a category sub-directory does not exist.
        UnicodeDecodeError: If a file is not valid UTF-8.
    """
    texts, labels = [], []
    for label in categories:
        label_dir = os.path.join(directory, label)
        # os.listdir() returns entries in arbitrary order. Sorting keeps the
        # corpus order stable, which a seeded split needs to be reproducible.
        for filename in sorted(os.listdir(label_dir)):
            filepath = os.path.join(label_dir, filename)
            if not os.path.isfile(filepath):
                continue  # open() would fail on a nested directory
            with open(filepath, encoding='utf-8') as f:
                texts.append(f.read())
            labels.append(label)
    return texts, labels

# Folder that holds the 'pos' and 'neg' review sub-directories.
# Change this to match your local folder layout.
data_dir = 'data/movie_reviews/txt_sentoken/'

# texts  -> the raw review text of every file
# labels -> 'pos' or 'neg' for the review at the same index
texts, labels = load_data(data_dir)

# Hold out 20% of the reviews for the final evaluation; train on the other 80%.
# random_state pins the shuffle so the split is repeatable for a given input order.
X_train, X_test, y_train, y_test = train_test_split(
    texts,
    labels,
    test_size=0.2,
    random_state=42,
)

# Pipeline: TF-IDF word features followed by a linear classifier trained with SGD.
text_clf = Pipeline(steps=[
    # Drop common English stop words ('the', 'is', 'and', ...) while vectorizing.
    ('tfidf', TfidfVectorizer(stop_words='english')),
    # Default loss is hinge, i.e. a linear SVM.
    # Train for at most 1000 passes, stopping earlier once the improvement
    # falls below tol; random_state makes training reproducible.
    ('clf', SGDClassifier(random_state=42, max_iter=1000, tol=1e-3)),
])

# Hyper-parameter grid explored by the search below.
parameters = {
    # Unigrams only, or unigrams plus bigrams.
    'tfidf__ngram_range': [(1, 1), (1, 2)],
    # With or without inverse-document-frequency re-weighting.
    'tfidf__use_idf': [True, False],
    # Regularization multiplier: LARGER values mean stronger regularization.
    'clf__alpha': [1e-3, 1e-4],
}

# Exhaustive search over every combination in the grid.
# cv=5 scores each combination with 5-fold cross-validation;
# n_jobs=-1 spreads the work over all available cores.
gs_clf = GridSearchCV(text_clf, parameters, cv=5, n_jobs=-1)
# Runs the search; by default the best combination is then refitted on all of X_train.
gs_clf.fit(X_train, y_train)

# Report the winning value of every tuned parameter, in alphabetical order.
print("Best parameters found:")
for param_name in sorted(parameters):
    best_value = gs_clf.best_params_[param_name]
    print(f"{param_name}: {best_value!r}")

# Score the tuned model on the reviews held out from the search.
y_pred = gs_clf.predict(X_test)

# Per-class precision, recall, F1-score and support for 'pos' and 'neg'.
report = metrics.classification_report(y_test, y_pred)
print(report)

# Confusion matrix: rows are the actual classes, columns the predicted classes;
# correct predictions sit on the diagonal.
confusion = metrics.confusion_matrix(y_test, y_pred)
print(confusion)

## Exercise 3

### Code for training and pickling the models:

In [None]:
import pickle
from sklearn.pipeline import Pipeline

# Assume that `text_clf_language` is the language detection pipeline from Exercise 1
# and `text_clf_sentiment` is the sentiment analysis pipeline from Exercise 2

def save_language_model():
    """Fit the language-detection pipeline and pickle it to ``language_model.pkl``.

    Reads the module-level names ``text_clf_language``, ``X_train_language``
    and ``y_train_language``; they are not defined in this cell and must
    exist before the function is called. The file is written to the current
    working directory and overwritten if it already exists.
    """
    # Fit first so that the pickle holds a trained estimator.
    text_clf_language.fit(X_train_language, y_train_language)

    with open('language_model.pkl', mode='wb') as model_file:
        pickle.dump(text_clf_language, model_file)

def save_sentiment_model():
    """Fit the sentiment-analysis pipeline and pickle it to ``sentiment_model.pkl``.

    Reads the module-level names ``text_clf_sentiment``, ``X_train_sentiment``
    and ``y_train_sentiment``; they are not defined in this cell and must
    exist before the function is called. The file is written to the current
    working directory and overwritten if it already exists.
    """
    # Fit first so that the pickle holds a trained estimator.
    text_clf_sentiment.fit(X_train_sentiment, y_train_sentiment)

    with open('sentiment_model.pkl', mode='wb') as model_file:
        pickle.dump(text_clf_sentiment, model_file)

# Persist both models; each saver fits its pipeline before writing the pickle.
for save_model in (save_language_model, save_sentiment_model):
    save_model()

### Code for the CLI utility:

In [None]:
import sys
import pickle
import argparse
from sklearn import metrics

# pickle stores functions by reference (module and name), not by value. The
# language pipeline from Exercise 1 embeds `custom_preprocessor`, so a function
# with that name must be resolvable when the model is unpickled; otherwise
# pickle.load() fails with AttributeError.
# NOTE(review): assumes the model was pickled from a notebook or script running
# as `__main__` and that this file is run as a script - confirm.
def custom_preprocessor(text):
    """Lower-case ``text``; must match the preprocessor used when training."""
    return text.lower()


# Load pickled models
# SECURITY: pickle.load() can execute arbitrary code. Only load model files
# that you created yourself or obtained from a trusted source.
with open('language_model.pkl', 'rb') as f:
    language_model = pickle.load(f)

with open('sentiment_model.pkl', 'rb') as f:
    sentiment_model = pickle.load(f)

# Labels treated as "English" by the language model. The label is whatever
# sub-directory name the training corpus used, so both the spelled-out name and
# the ISO code are accepted.
# NOTE(review): confirm against the trained model's `classes_`.
_ENGLISH_LABELS = ('english', 'en')


def _predict_with_confidence(model, text):
    """Predict the label of one document together with a confidence score.

    Args:
        model: A fitted classifier or pipeline exposing ``predict`` and either
            ``predict_proba`` or ``decision_function``.
        text: The document to classify.

    Returns:
        A ``(label, score, is_probability)`` tuple. ``score`` is the highest
        class probability when the model offers ``predict_proba``. Otherwise
        it is the decision margin (largest per-class score, or the absolute
        signed distance for a binary model) and ``is_probability`` is False.
    """
    label = model.predict([text])[0]

    # SGDClassifier only exposes predict_proba for probabilistic losses
    # ('log_loss', 'modified_huber'); with hinge loss the attribute is absent.
    predict_proba = getattr(model, 'predict_proba', None)
    if predict_proba is not None:
        return label, float(max(predict_proba([text])[0])), True

    margin = model.decision_function([text])[0]
    try:
        score = max(margin)   # multiclass: one score per class
    except TypeError:
        score = abs(margin)   # binary: a single signed distance
    return label, float(score), False


def _confidence_note(is_probability):
    """Return the suffix that flags a score which is not a probability."""
    return "" if is_probability else " (decision margin, not a probability)"


# Command-line utility for text classification
def classify_text(input_text):
    """Detect the language of ``input_text`` and, for English, its sentiment.

    Uses the module-level ``language_model`` and ``sentiment_model`` and
    prints the results to stdout. Sentiment is only analysed when the detected
    language is one of ``_ENGLISH_LABELS``.

    Args:
        input_text: The text to classify. Empty or whitespace-only input is
            reported and not classified.
    """
    if not input_text or not input_text.strip():
        print("No input text provided.")
        return

    # Step 1: Language Detection
    language, language_score, language_is_proba = _predict_with_confidence(
        language_model, input_text
    )
    print(
        f"Detected language: {language} with confidence: "
        f"{language_score:.4f}{_confidence_note(language_is_proba)}"
    )

    # Step 2: Sentiment Analysis (English only)
    if language not in _ENGLISH_LABELS:
        print("Sentiment analysis is only available for English text.")
        return

    sentiment, sentiment_score, sentiment_is_proba = _predict_with_confidence(
        sentiment_model, input_text
    )
    sentiment_label = 'positive' if sentiment == 'pos' else 'negative'
    print(
        f"Sentiment: {sentiment_label} with confidence: "
        f"{sentiment_score:.4f}{_confidence_note(sentiment_is_proba)}"
    )

def main(argv=None):
    """Parse the command line and classify the supplied text.

    The text is taken from ``--text`` when that option is given and
    non-empty; otherwise all of stdin is read and stripped of surrounding
    whitespace.

    Args:
        argv: Argument list to parse. ``None`` (the default) makes argparse
            read ``sys.argv[1:]``.
    """
    # Set up argument parsing
    parser = argparse.ArgumentParser(description="CLI utility for language detection and sentiment analysis.")
    parser.add_argument('--text', type=str, help="Text to classify. If not provided, input will be read from stdin.")
    args = parser.parse_args(argv)

    # If --text argument is provided, classify that text, otherwise read from stdin
    if args.text:
        classify_text(args.text)
        return

    # Only prompt a human at a terminal, and do it on stderr, so that piped
    # usage (echo "..." | python cli_text_classification.py) keeps stdout
    # limited to the classification results.
    if sys.stdin.isatty():
        print("Please provide some input text:", file=sys.stderr)
    classify_text(sys.stdin.read().strip())


if __name__ == "__main__":
    main()

## Breakdown of the CLI Utility:

### Model loading:
- The models are loaded from their respective pickled files (`language_model.pkl` and `sentiment_model.pkl`).

### Classification process:
- The utility first predicts the **language** of the input text using the language detection model.
- If the detected language is **English**, the sentiment analysis model is used to predict whether the sentiment is **positive** or **negative**.

### Confidence score:
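- For both the language detection and the sentiment analysis model, the confidence score is taken from the model's `predict_proba` method when the model provides one.
- The highest of the predicted class probabilities is displayed as the confidence level.
- Note: `SGDClassifier` only provides `predict_proba` when it is trained with a probabilistic loss (`loss='log_loss'` or `loss='modified_huber'`); with the hinge loss used in Exercises 1 and 2 the method is not available.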

### Command-line arguments:
- The utility can accept input through the `--text` argument.
- If no text is provided via the argument, the utility will prompt for input via **stdin**.

### Usage:

#### To classify a string provided as a command-line argument:
```bash
python cli_text_classification.py --text "This is a great movie!"
```

#### To classify text input from stdin:
```bash
echo "I love this movie!" | python cli_text_classification.py
```
