<a href="https://colab.research.google.com/github/InukshiSenarathne/chatbot_colab_testing/blob/main/TEXT_ANALYZER.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import nltk
# Fetch the NLTK resources needed at runtime. 'stopwords' and 'wordnet' back
# the stop-word filter and the WordNetLemmatizer used by preprocess_text.
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

# Notebook shell magics: install spaCy and its small English pipeline.
# spaCy is imported by the later cells.
!pip install spacy
!python -m spacy download en_core_web_sm


In [None]:
import json
import string
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import spacy
from spacy.pipeline import DependencyParser
from spacy.lang.en import English
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import precision_score, recall_score, f1_score
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import TfidfVectorizer

# Step 1: Preprocessing the Data
def preprocess_text(text):
    """Normalise a raw string before it is vectorised.

    The text has its ASCII punctuation stripped, is lowercased, loses every
    NLTK English stop word and finally has each remaining whitespace-separated
    token lemmatized with WordNet.

    Args:
        text: The raw input string.

    Returns:
        The surviving lemmas joined by single spaces.
    """
    # Punctuation is removed first, then the case is folded.
    punctuation_table = str.maketrans('', '', string.punctuation)
    cleaned = text.translate(punctuation_table).lower()

    # Filter out stop words token by token.
    english_stop_words = set(stopwords.words('english'))
    kept_tokens = [token for token in cleaned.split() if token not in english_stop_words]

    # Reduce every remaining token to its lemma.
    wordnet = WordNetLemmatizer()
    return ' '.join(wordnet.lemmatize(token) for token in kept_tokens)

# Step 2: Splitting the Data
def split_data(X, y):
    """Partition features and labels into train and test subsets.

    20% of the samples are held out for testing and the shuffle is seeded
    with ``random_state=42`` so the split is repeatable.

    Args:
        X: Feature matrix.
        y: Labels aligned with the rows of ``X``.

    Returns:
        A tuple ``(X_train, X_test, y_train, y_test)``.
    """
    # train_test_split hands back a list; callers receive a tuple.
    return tuple(train_test_split(X, y, test_size=0.2, random_state=42))

# Step 3: Building the Models
def train_models(X_train, y_train):
    """Fit the three candidate regressors on the training data.

    Each estimator is created with its default hyper-parameters and fitted
    in turn: random forest, k-nearest neighbours, gradient boosting.

    Args:
        X_train: Training feature matrix.
        y_train: Training labels.

    Returns:
        A list of ``(display_name, fitted_estimator)`` tuples in the order
        the models were trained.
    """
    candidates = (
        ('Random Forest', RandomForestRegressor),
        ('KNN', KNeighborsRegressor),
        ('Gradient Boosting', GradientBoostingRegressor),
    )

    fitted = []
    for label, regressor_cls in candidates:
        # Instantiate right before fitting so models are built one at a time.
        regressor = regressor_cls()
        regressor.fit(X_train, y_train)
        fitted.append((label, regressor))

    return fitted

def evaluate_models(models, X_test, y_test):
    """Print macro-averaged precision, recall and F1 for every model.

    The regressors' continuous predictions are rounded to the nearest
    integer so they can be compared against the labels with
    classification metrics.

    Args:
        models: Iterable of ``(name, fitted_model)`` pairs.
        X_test: Held-out feature matrix.
        y_test: Held-out labels.

    Returns:
        None. One summary line per model is written to stdout.
    """
    for name, model in models:
        # Round the regression output so it can be scored as class labels.
        y_pred = np.around(model.predict(X_test))

        precision = precision_score(y_test, y_pred, average='macro')
        recall = recall_score(y_test, y_pred, average='macro')
        f1 = f1_score(y_test, y_pred, average='macro')

        # Report the computed precision; the placeholder previously held the
        # literal 0, so every model was shown with "Precision: 0".
        print(f'{name} - Precision: {precision}, Recall: {recall}, F1-score: {f1}')

# Step 4: Plotting the Graphs
def plot_performance(models, X_test, y_test):
    """Draw a grouped bar chart of precision, recall and F1 per model.

    Predictions are rounded to the nearest integer and scored with
    ``average='weighted'``. Each model gets a group of three bars labelled
    with the model's name.

    Args:
        models: Iterable of ``(name, fitted_model)`` pairs.
        X_test: Held-out feature matrix.
        y_test: Held-out labels.

    Returns:
        None. The figure is displayed with ``plt.show()``.
    """
    names = []
    precisions = []
    recalls = []
    f1_scores = []

    for name, model in models:
        # Round the regression output so it can be scored as class labels.
        y_pred = np.around(model.predict(X_test))

        # Record the model name alongside its scores. Without it the x axis
        # is empty and the bar heights cannot be matched to positions.
        names.append(name)
        precisions.append(precision_score(y_test, y_pred, average='weighted'))
        recalls.append(recall_score(y_test, y_pred, average='weighted'))
        f1_scores.append(f1_score(y_test, y_pred, average='weighted'))

    # One x position per model; the three metrics sit side by side around it.
    x = np.arange(len(names))
    width = 0.2

    fig, ax = plt.subplots()
    ax.bar(x - width, precisions, width, label='Precision')
    ax.bar(x, recalls, width, label='Recall')
    ax.bar(x + width, f1_scores, width, label='F1-score')

    ax.set_ylabel('Scores')
    ax.set_title('Model Performance')
    ax.set_xticks(x)
    ax.set_xticklabels(names)
    ax.legend()

    plt.show()

# Step 5: Prediction Model
def train_prediction_model(X, y):
    """Fit the final prediction model on all supplied data.

    The model type is hard-coded to a default ``RandomForestRegressor``;
    it is not chosen programmatically from the evaluation results.

    Args:
        X: Feature matrix.
        y: Labels aligned with the rows of ``X``.

    Returns:
        The fitted ``RandomForestRegressor``.
    """
    # fit() returns the estimator itself, so it can be returned directly.
    return RandomForestRegressor().fit(X, y)

def predict_traits(model, text):
    """Predict the trait value(s) for a single piece of text.

    The text goes through ``preprocess_text`` and is then turned into
    features by the module-level ``vectorizer``, which must already be
    defined (and fitted) when this function is called.

    Args:
        model: A fitted estimator exposing ``predict``.
        text: The raw input string.

    Returns:
        Whatever ``model.predict`` returns for the one-row feature matrix.
    """
    cleaned = preprocess_text(text)
    # transform() expects an iterable of documents, hence the one-item list.
    features = vectorizer.transform([cleaned])
    return model.predict(features)

# Execution start
# Load the interview data from JSON; the 'text' entry holds the documents.
# presumably a list of strings (each item is passed to preprocess_text) — verify
with open('drive/MyDrive/research_project_y4_sliit/sample_data/interview_data.json') as f:
    data = json.load(f)
    text_data = data['text']

# Preprocess every document
preprocessed_data = [preprocess_text(text) for text in text_data]

# Vectorize the preprocessed data with TF-IDF.
# `vectorizer` stays at module level because predict_traits reads it.
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(preprocessed_data)

# Generate random integer labels in 1..5, one per document, for demonstration
# purposes only (replace with actual labels). No seed is set, so they differ
# on every run.
y = np.random.randint(1, 6, len(text_data))
print(y)
# y = ['Kind', '']

# Split the data into train and test sets
X_train, X_test, y_train, y_test = split_data(X, y)

# Train the candidate models
models = train_models(X_train, y_train)

# Print precision / recall / F1 for each model
evaluate_models(models, X_test, y_test)

# Plot the performance of the models (call currently disabled)
#plot_performance(models, X_test, y_test)

# Train the prediction model on the full data set
prediction_model = train_prediction_model(X, y)

# Predict traits for a new candidate's text
new_text = "I am a highly motivated individual who enjoys working in a team."
traits = predict_traits(prediction_model, new_text)
print(f"Predicted traits: {traits}")


[2 3 1 5 4 4 2 1 4 1 2 1 3 1 3 1 2 4 1 4 4 2 1 4 4 1 1 4 1 1 1 5 1 2]
Random Forest - Precision: 0, Recall: 0.3333333333333333, F1-score: 0.16666666666666666
KNN - Precision: 0, Recall: 0.0, F1-score: 0.0
Gradient Boosting - Precision: 0, Recall: 0.3333333333333333, F1-score: 0.09523809523809523
Predicted traits: [1.77]


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [None]:
import json
import string
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import spacy
from spacy.pipeline import DependencyParser
from spacy.lang.en import English
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import precision_score, recall_score, f1_score
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import TfidfVectorizer

# Step 1: Preprocessing the Data
def preprocess_text(text):
    """Clean a raw string so it can be fed to the vectorizer.

    Steps, in order: drop ASCII punctuation, lowercase, discard NLTK English
    stop words, lemmatize what is left with WordNet.

    Args:
        text: The raw input string.

    Returns:
        A single string of lemmas separated by one space each.
    """
    # Strip punctuation, then fold case.
    stripped = text.translate(str.maketrans('', '', string.punctuation))
    lowered = stripped.lower()

    stop_words = set(stopwords.words('english'))
    lemmatizer = WordNetLemmatizer()

    # Single pass: skip stop words, lemmatize everything else.
    lemmas = []
    for word in lowered.split():
        if word in stop_words:
            continue
        lemmas.append(lemmatizer.lemmatize(word))

    return ' '.join(lemmas)

# Step 2: Splitting the Data
def split_data(X, y):
    """Return an 80/20 train/test split of ``X`` and ``y``.

    The split uses a fixed ``random_state`` of 42, so repeated calls on the
    same data give the same partition.

    Args:
        X: Feature matrix.
        y: Labels aligned with the rows of ``X``.

    Returns:
        A tuple ``(X_train, X_test, y_train, y_test)``.
    """
    split_options = {'test_size': 0.2, 'random_state': 42}
    train_X, test_X, train_y, test_y = train_test_split(X, y, **split_options)
    return train_X, test_X, train_y, test_y

# Step 3: Building the Models
def train_models(X_train, y_train):
    """Train the three regressors that are compared against each other.

    All estimators use their default hyper-parameters and are fitted in the
    order random forest, k-nearest neighbours, gradient boosting.

    Args:
        X_train: Training feature matrix.
        y_train: Training labels.

    Returns:
        A list of ``(display_name, fitted_estimator)`` tuples.
    """
    # fit() returns the estimator, so creation and training chain together.
    forest = RandomForestRegressor().fit(X_train, y_train)
    neighbours = KNeighborsRegressor().fit(X_train, y_train)
    boosted = GradientBoostingRegressor().fit(X_train, y_train)

    return [
        ('Random Forest', forest),
        ('KNN', neighbours),
        ('Gradient Boosting', boosted),
    ]

def evaluate_models(models, X_test, y_test):
    """Report macro-averaged precision, recall and F1 for each model.

    Continuous predictions are rounded to the nearest integer before being
    scored with the classification metrics.

    Args:
        models: Iterable of ``(name, fitted_model)`` pairs.
        X_test: Held-out feature matrix.
        y_test: Held-out labels.

    Returns:
        None. One line per model is printed to stdout.
    """
    scorers = (precision_score, recall_score, f1_score)

    for name, estimator in models:
        rounded = np.around(estimator.predict(X_test))
        # Evaluate the three metrics in the fixed order precision, recall, F1.
        precision, recall, f1 = [
            scorer(y_test, rounded, average='macro') for scorer in scorers
        ]
        print(f'{name} - Precision: {precision}, Recall: {recall}, F1-score: {f1}')

# Step 4: Plotting the Graphs
def plot_performance(models, X_test, y_test):
    """Show a grouped bar chart comparing the models' scores.

    For every model the predictions are rounded to the nearest integer and
    precision, recall and F1 are computed with ``average='weighted'``. The
    x axis carries one tick per model, labelled with the model's name.

    Args:
        models: Iterable of ``(name, fitted_model)`` pairs.
        X_test: Held-out feature matrix.
        y_test: Held-out labels.

    Returns:
        None. The figure is displayed with ``plt.show()``.
    """
    names = []
    precisions = []
    recalls = []
    f1_scores = []

    for name, model in models:
        y_pred = model.predict(X_test)
        y_pred = np.around(y_pred)
        precision = precision_score(y_test, y_pred, average='weighted')
        recall = recall_score(y_test, y_pred, average='weighted')
        f1 = f1_score(y_test, y_pred, average='weighted')
        # Keep the label in step with the scores. The list used to stay
        # empty, leaving zero x positions for a non-empty set of bar heights.
        names.append(name)
        precisions.append(precision)
        recalls.append(recall)
        f1_scores.append(f1)

    # One group per model, three bars of width 0.2 per group.
    x = np.arange(len(names))
    width = 0.2

    fig, ax = plt.subplots()
    ax.bar(x - width, precisions, width, label='Precision')
    ax.bar(x, recalls, width, label='Recall')
    ax.bar(x + width, f1_scores, width, label='F1-score')

    ax.set_ylabel('Scores')
    ax.set_title('Model Performance')
    ax.set_xticks(x)
    ax.set_xticklabels(names)
    ax.legend()

    plt.show()

# Step 5: Prediction Model
def train_prediction_model(X, y):
    """Train the model used for final predictions.

    A default ``RandomForestRegressor`` is always used; the choice is fixed
    in code rather than derived from the evaluation step.

    Args:
        X: Feature matrix.
        y: Labels aligned with the rows of ``X``.

    Returns:
        The fitted ``RandomForestRegressor``.
    """
    final_model = RandomForestRegressor()
    final_model.fit(X, y)
    return final_model

def predict_traits(model, text):
    """Run one raw text through preprocessing, vectorisation and prediction.

    Relies on the module-level ``vectorizer``, which has to exist (and be
    fitted) before this function is called.

    Args:
        model: A fitted estimator exposing ``predict``.
        text: The raw input string.

    Returns:
        The result of ``model.predict`` for the single vectorised document.
    """
    # The vectorizer takes a collection of documents, so wrap the one text.
    documents = [preprocess_text(text)]
    return model.predict(vectorizer.transform(documents))

# Load the interview data from JSON; the 'text' entry holds the documents.
# presumably a list of strings (each item is passed to preprocess_text) — verify
with open('drive/MyDrive/research_project_y4_sliit/sample_data/interview_data.json') as f:
    data = json.load(f)
    text_data = data['text']

# Preprocess every document
preprocessed_data = [preprocess_text(text) for text in text_data]

# Vectorize the preprocessed data with TF-IDF.
# `vectorizer` stays at module level because predict_traits reads it.
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(preprocessed_data)

# Load the labeled dataset for Big Five personality traits
# (the labels are read from the 'labels' entry of the JSON file)
with open('drive/MyDrive/research_project_y4_sliit/sample_data/labled_data.json') as f:
    labeled_data = json.load(f)
    labels = labeled_data['labels']

# Convert labels to NumPy array
# NOTE(review): the recorded run of this cell ended in a ValueError. Confirm
# that there is exactly one label per document in text_data and that the
# label format suits the metrics used in evaluate_models.
y = np.array(labels)

# Split the data into train and test sets
X_train, X_test, y_train, y_test = split_data(X, y)

# Train the candidate models
models = train_models(X_train, y_train)

# Print precision / recall / F1 for each model
evaluate_models(models, X_test, y_test)

# Plot the performance of the models (call currently disabled)
# plot_performance(models, X_test, y_test)

# Train the prediction model on the full data set
prediction_model = train_prediction_model(X, y)

# Predict traits for a new candidate's text
new_text = "I am a highly motivated individual who enjoys working in a team."
traits = predict_traits(prediction_model, new_text)
print(f"Predicted traits: {traits}")


ValueError: ignored