In [1]:
import csv
import re
import nltk

from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# Download the stopwords and wordnet lemmatizer data
nltk.download('punkt')
nltk.download('omw-1.4')
nltk.download('stopwords')
nltk.download('wordnet')

def preprocess_answers(input_file, output_file):
  with open(input_file, 'r') as input_csv, open(output_file, 'w') as output_csv:
    reader = csv.reader(input_csv)
    writer = csv.writer(output_csv)

    # Initialize the WordNet lemmatizer and stop words set
    lemmatizer = WordNetLemmatizer()
    stop_words = set(stopwords.words('english'))

    # Iterate over the rows in the input CSV
    for row in reader:
      question = row[0]
      answer = row[1]

      # Pre-process the answer by lowercasing it and removing any punctuation
      answer = answer.lower()
      answer = re.sub(r'[^\w\s]', '', answer)

      # Tokenize the answer
      tokens = nltk.word_tokenize(answer)

      # Remove stop words and lemmatize the remaining tokens
      tokens = [lemmatizer.lemmatize(token) for token in tokens if token not in stop_words]

      # Join the lemmatized tokens back into a single string
      answer = ' '.join(tokens)

      # Write the question and pre-processed answer to the output CSV
      writer.writerow([question, answer])

preprocess_answers('/content/sample_data/fyp.csv', 'output.csv')


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [2]:
import csv
import random

def split_data(input_file, train_output_file, test_output_file, test_size=0.2):
  with open(input_file, 'r') as input_csv, \
       open(train_output_file, 'w') as train_csv, \
       open(test_output_file, 'w') as test_csv:
    reader = csv.reader(input_csv)
    train_writer = csv.writer(train_csv)
    test_writer = csv.writer(test_csv)

    # Read in all the rows from the input CSV file
    rows = [row for row in reader]

    # Randomly shuffle the rows
    random.shuffle(rows)

    # Calculate the size of the test set
    test_size = int(len(rows) * test_size)

    # Split the rows into the train and test sets
    train_rows = rows[test_size:]
    test_rows = rows[:test_size]

    # Write the train and test rows to the output CSV files
    train_writer.writerows(train_rows)
    test_writer.writerows(test_rows)

split_data('output.csv', 'train.csv', 'test.csv')




In [3]:
import random
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
questions = []
answers = []
with open('train.csv', 'r') as train_csv:
  reader = csv.reader(train_csv)
  for row in reader:
    questions.append(row[0])
    answers.append(row[1])

# Preprocess the data
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(questions)
y = answers

# Train a logistic regression model
model = LogisticRegression()
model.fit(X, y)

LogisticRegression()

In [67]:
import csv
import random
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from scipy.sparse import csr_matrix
from sklearn.metrics.pairwise import cosine_similarity

# Read the questions and answers from the train.csv file
questions = []
answers = []
with open('train.csv', 'r') as train_csv:
  reader = csv.reader(train_csv)
  for row in reader:
    questions.append(row[0])
    answers.append(row[1])

# Choose a random question and answer from the train.csv file
i = random.randint(0, len(questions)-1)
question = questions[i]
dataset_answer = answers[i]

# Print the chosen question
print("Question:", question)

# Preprocess the user's answer and the answer in the training dataset
user_answer = input("Enter your answer: ")
vectorizer = CountVectorizer()
X_user = vectorizer.fit_transform([user_answer])
X_dataset = vectorizer.fit_transform([dataset_answer])



X_shape = X_user.shape
Y_shape = X_dataset.shape

# If the number of columns in the X matrix is not equal to the number of columns in the Y matrix,
# add or remove columns from one of the matrices as needed
if X_shape[1] != Y_shape[1]:
    # Calculate the difference in the number of columns
    diff = Y_shape[1] - X_shape[1]
    if diff > 0:
        # Add columns to the X matrix
        for i in range(diff):
            X_user = np.append(X_user, 0)
    else:
        # Add columns to the Y matrix
        for i in range(abs(diff)):
            X_dataset = np.append(X_dataset, 0)




#X_user = X_user.todense()
#X_dataset = X_dataset.todense()
#X_user = csr_matrix(X_user).toarray()
#X_dataset = csr_matrix(X_dataset).toarray()


# Compute the cosine similarity between the user's answer and the answer in the training dataset
similarity = cosine_similarity(X_user, X_dataset)
score = similarity[0][0]

# Print the similarity score
print("Similarity score:", score)


Question: In speech recognition what kind of signal is used?
Enter your answer: peech recognition acoustic signal used identify sequence word
Similarity score: 0.9999999999999999
