# In [1]:
import pandas as pd
import numpy as np
import re
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from sklearn.metrics import classification_report, confusion_matrix
import fasttext
from itertools import product

# Read the training and testing spreadsheets into DataFrames.
# The rest of this script reads the `report` and `class_name` columns of both.
train_path, test_path = 'train.xlsx', 'test.xlsx'
train_df = pd.read_excel(io=train_path)
test_df = pd.read_excel(io=test_path)

# Preprocessing functions
# Compiled once and shared by every call to preprocess().
_PUNCTUATION_RE = re.compile(r'[^\w\s]')
_DIGITS_RE = re.compile(r'\d+')
# Created on first use so that defining this block does not touch NLTK.
_LEMMATIZER = None


def preprocess(text):
    """Normalize one piece of raw text into a space-joined string of lemmas.

    Steps, in order: lowercase, strip every character that is neither a
    word character nor whitespace, strip digit runs, tokenize with NLTK's
    ``word_tokenize``, lemmatize each token with ``WordNetLemmatizer``, and
    join the tokens back with single spaces.

    Args:
        text: The raw value to clean. Non-string values are converted with
            ``str()``. ``None`` and float NaN are treated as "no text".

    Returns:
        The cleaned text, or ``''`` when ``text`` is ``None`` or NaN.
    """
    global _LEMMATIZER

    # A missing value would otherwise be stringified into the literal word
    # "none"/"nan" and end up as a token; map it to empty text instead.
    # (NaN is the only float that is not equal to itself.)
    if text is None or (isinstance(text, float) and text != text):
        return ''

    if _LEMMATIZER is None:
        _LEMMATIZER = WordNetLemmatizer()

    cleaned = str(text).lower()  # Convert to lowercase
    cleaned = _PUNCTUATION_RE.sub('', cleaned)  # Remove punctuation
    cleaned = _DIGITS_RE.sub('', cleaned)  # Remove numbers
    tokens = word_tokenize(cleaned)  # Tokenize
    return ' '.join(_LEMMATIZER.lemmatize(token) for token in tokens)  # Lemmatize

# Add a cleaned `bug_description` column, derived from `report`, to both
# the training and the testing frame.
for frame in (train_df, test_df):
    frame['bug_description'] = frame['report'].apply(preprocess)

# Define a function to train a FastText model
def train_fasttext_model(train_df, test_df, ngram_range=(2, 2), word_ngrams=1):
    """Train a supervised FastText classifier and print its test metrics.

    Args:
        train_df: DataFrame whose ``class_name`` column holds the label and
            whose ``bug_description`` column holds the text of each
            training example.
        test_df: DataFrame with the same two columns, used for evaluation.
        ngram_range: ``(min, max)`` pair passed to FastText as ``minn`` and
            ``maxn``.
        word_ngrams: Value passed to FastText as ``wordNgrams``.

    Returns:
        None. The classification report and the confusion matrix for
        ``test_df`` are printed to standard output.

    NOTE(review): each example is written as one ``__label__<class> <text>``
    line, so labels are assumed to contain no whitespace and texts no line
    breaks -- confirm against the data.
    """
    # Imported here so the function does not depend on extra file-level imports.
    import os
    import tempfile

    train_lines = [
        f"__label__{label} {text}"
        for label, text in zip(train_df['class_name'], train_df['bug_description'])
    ]

    # train_supervised() takes the path of a training file, not an in-memory
    # list, so the examples go through a temporary file that is removed as
    # soon as training has finished (or failed).
    with tempfile.NamedTemporaryFile(
        'w', suffix='.txt', delete=False, encoding='utf-8'
    ) as handle:
        handle.writelines(line + '\n' for line in train_lines)
        train_file = handle.name
    try:
        model = fasttext.train_supervised(
            input=train_file,
            wordNgrams=word_ngrams,
            minn=ngram_range[0],
            maxn=ngram_range[1],
        )
    finally:
        os.remove(train_file)

    test_data = test_df['bug_description'].tolist()
    # Predictions come back as strings, and training labels were formatted
    # into strings above; stringify the true labels the same way so the
    # metrics never compare mixed types.
    true_labels = [str(label) for label in test_df['class_name']]
    predicted_labels = [
        model.predict(text)[0][0].replace('__label__', '') for text in test_data
    ]

    print("Classification Report:")
    print(classification_report(true_labels, predicted_labels))
    print("Confusion Matrix:")
    print(confusion_matrix(true_labels, predicted_labels))

# Define a function to evaluate FastText models with different n-gram ranges
def evaluate_fasttext_models(train_df, test_df):
    """Train and report one FastText model per n-gram configuration.

    Every combination of an n-gram range and a ``word_ngrams`` value is
    announced with a header line and then handed to
    ``train_fasttext_model``, which prints the metrics for that run.

    Args:
        train_df: Training DataFrame, forwarded to ``train_fasttext_model``.
        test_df: Testing DataFrame, forwarded to ``train_fasttext_model``.

    Returns:
        None.
    """
    # Minimum length 2-4, maximum length up to 9. The maximum starts at the
    # minimum: a range whose minimum exceeds its maximum describes no
    # n-gram length at all and would only cost a redundant training run.
    ngram_ranges = [(low, high) for low in range(2, 5) for high in range(low, 10)]
    word_ngrams = [1]  # Change this if you want to try different values
    for ngram_range, word_ngram in product(ngram_ranges, word_ngrams):
        print(f"Evaluating with ngram_range: {ngram_range}, word_ngrams: {word_ngram}")
        train_fasttext_model(train_df, test_df, ngram_range=ngram_range, word_ngrams=word_ngram)

# Run the full n-gram sweep on the loaded training and testing frames.
evaluate_fasttext_models(train_df=train_df, test_df=test_df)


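# Cell output: ModuleNotFoundError: No module named 'fasttext'
# (the fasttext package must be installed, e.g. `pip install fasttext`, before this cell can run).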