In [3]:
import pandas as pd
import re
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
import string
import nltk
from nltk.corpus import words


# Download NLTK data is available
nltk.download('stopwords', quiet=True)
nltk.download('wordnet', quiet=True)
nltk.download('punkt', quiet=True)
nltk.download('words', quiet=True)

#Build and Train Model
import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score
from sklearn.pipeline import Pipeline
import joblib


In [4]:

def cleaned_text(raw_text):

    # Ensure input is a string
    if not isinstance(raw_text, str):
        raw_text = str(raw_text)

    # Convert to lowercase
    text = raw_text.lower()

    # Remove URLs
    text = re.sub(r'http\S+|www\S+|https\S+', '', text, flags=re.MULTILINE)

    # Remove bash-related content
    text = re.sub(r'#.*', '', text)  # Remove comments
    text = re.sub(r'^\s*\$\s*', '', text, flags=re.MULTILINE)
    text = re.sub(r'^(bash\$|➜|→|>\s*)', '', text, flags=re.MULTILINE)
    text = re.sub(r'\\[\$`"\'\\]', '', text)

    # Remove special characters
    text = re.sub(r'[^a-zA-Z0-9\s]', '', text)

    # Remove extra whitespaces
    text = re.sub(r'\s+', ' ', text).strip()

    return text

In [5]:
def remove_special_characters(text):

    # Remove punctuation
    text = text.translate(str.maketrans('', '', string.punctuation))

    # Remove numbers
    text = re.sub(r'\d+', '', text)
    text = re.sub(r'[^a-zA-Z0-9\s]', '', text)

    # Remove extra whitespaces
    text = re.sub(r'\s+', ' ', text).strip()


In [6]:
def remove_non_english_text(text):
    text = re.sub(r"[^a-zA-Z0-9\s.,!?;:'\"()\-]", "", text)

    # Split text into words
    words_list = text.split()

    # Filter out non-English words
    english_words = set(words.words())
    filtered_words = [word for word in words_list if word in english_words]

    return ' '.join(filtered_words)

In [7]:
# Tokenization
def tokenize_and_process_text(text):
    """
    Tokenize, remove stopwords, and lemmatize text
    """
    # Ensure input is a string
    if not isinstance(text, str):
        text = str(text)

    # Tokenize the text
    tokens = word_tokenize(text.lower())

    # Remove stopwords and short words
    stop_words = set(stopwords.words('english'))
    tokens = [token for token in tokens if
              token.isalnum() and
              token not in stop_words and
              len(token) > 2]

In [8]:
#Remove stopwords
def remove_stopwords(tokens):
    stop_words = set(stopwords.words('english'))
    tokens = re.sub(r'\b([a-zA-Z])\1{2,}\b', '', tokens) #remove words such as lll...

    # Filter out stop words and very short words (less than 3 characters)
    tokens = [token for token in tokens if token not in stop_words and len(token) > 2]


    # Lemmatization
    lemmatizer = WordNetLemmatizer()
    tokens = [lemmatizer.lemmatize(token) for token in tokens]

    return ' '.join(tokens)

In [9]:
# Load and preprocess data
data = pd.read_csv("sample1.csv.gz")

    # Filter and clean data
data_filtered = data[['issue_label', 'issue_body']].dropna()
data_filtered = data_filtered.drop_duplicates(subset=['issue_body'])

In [10]:
 # Apply text cleaning
data_filtered['issue_body'] = data_filtered['issue_body'].apply(cleaned_text)

cleaned_data = data_filtered[['issue_label', 'issue_body']]

# Save the filtered data to a new file (optional)
cleaned_data.to_csv('cleaned_data.csv',index=False)

In [11]:
# Split data into features and target
X = data_filtered['issue_body']
y = data_filtered['issue_label']

In [12]:
    # Create pipeline
pipeline = Pipeline([
    ('tfidf', TfidfVectorizer(stop_words='english', max_features=5000)),
    ('classifier', RandomForestClassifier(
        n_estimators=200,
        max_depth=20,
        min_samples_split=5,
        random_state=42
    ))
])

In [13]:
    # Split the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=39)

In [14]:
    # Fit the model
pipeline.fit(X_train, y_train)

In [15]:

# Predictions
y_pred = pipeline.predict(X_test)

In [16]:
# Evaluate the model
print("Classification Report:")
print(classification_report(y_test, y_pred))

# Evaluate accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy * 100:.2f}%')

Classification Report:
              precision    recall  f1-score   support

         bug       0.74      0.81      0.77      6649
 enhancement       0.69      0.77      0.72      4949
    question       1.00      0.00      0.01      1178

    accuracy                           0.72     12776
   macro avg       0.81      0.53      0.50     12776
weighted avg       0.74      0.72      0.68     12776

Accuracy: 71.77%


In [17]:
# Cross-validation
cv_scores = cross_val_score(pipeline, X, y, cv=5)
print("\nCross-validation Scores:")
print(cv_scores)
print(f"Mean CV Score: {cv_scores.mean() * 100:.2f}%")


Cross-validation Scores:
[0.70608954 0.70716243 0.71483366 0.71021526 0.71569472]
Mean CV Score: 71.08%


In [18]:
# Example input string (this will be the text you want to classify)
input_text = "Is your feature request related to a problem?When running an application offline, it can often be useful to have spans of internal metrics recorded for troubleshooting"

# Prepare the input data (make sure it's in DataFrame format)
input_data = pd.DataFrame({'issue_body': [input_text]})


# Make prediction using the trained model pipeline
prediction = pipeline.predict(input_data)

# Output the prediction
print(f'Predicted Label: {prediction[0]}')

Predicted Label: enhancement


In [19]:
    # Save the trained model for later use in Flask app
joblib.dump(pipeline, 'trainmodel.h5')

# Load model function for Flask app
def load_model():
        return joblib.load('trainmodel.h5')