## Use the vector space model and cosine similarity for text classification

In [None]:
!pip install odfpy

Collecting odfpy
  Downloading odfpy-1.4.1.tar.gz (717 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/717.0 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m [32m716.8/717.0 kB[0m [31m21.7 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m717.0/717.0 kB[0m [31m13.9 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: odfpy
  Building wheel for odfpy (setup.py) ... [?25l[?25hdone
  Created wheel for odfpy: filename=odfpy-1.4.1-py2.py3-none-any.whl size=160671 sha256=575742679f3f0f2fb0eb71e1eedffa10a682779a0074ae28ccc43d65243c7110
  Stored in directory: /root/.cache/pip/wheels/c8/2e/95/90d94fe33903786937f3b8c33dd88807f792359c6424b40469
Successfully built odfpy
Installing collected packages: odfpy
Successfully installed odfpy-1.4.1


In [None]:
import pandas as pd

# Read the small labelled corpus from the OpenDocument spreadsheet and
# preview its first rows.
df = pd.read_excel(io='ChinaJapan.ods', engine='odf')
print(df.head(n=5))

   Doc                    Words  Class
0    1   Chinese Beijing Chinese     c
1    2  Chinese Chinese Shanghai     c
2    3             Chinese Macao     c
3    4       Tokyo Japan Chinese     j


### a. Construct the vector space model (preprocess the text, compute the bag-of-words and TF-IDF representations) and compute the importance of the word "Chinese" in the test data.

In [None]:
import numpy as np
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize

# Fetch the NLTK resources used below: the Punkt tokenizer models for
# word_tokenize and the English stop-word list.
# 'punkt_tab' is requested in addition to 'punkt' because recent NLTK
# releases load the tokenizer from that resource; without it word_tokenize
# raises LookupError on a fresh environment.
for resource in ('punkt', 'punkt_tab', 'stopwords'):
    nltk.download(resource)



[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [None]:
import pandas as pd
df = pd.read_excel('ChinaJapan.ods', engine='odf')
print(df.head())

# Printing the column index makes the exact header names visible: the text
# column is named 'Words ' (with a trailing space), which is the key used below.
print(df.columns)

# Build the stop-word set and the stemmer once, before the loop. They do not
# change between documents, and later cells reuse `stop_words` and `stemmer`
# as globals, so they must exist even if the frame has no rows.
stop_words = set(stopwords.words('english'))
stemmer = PorterStemmer()

# Preprocess every document: tokenize, keep alphanumeric tokens (lower-cased),
# drop English stop words, then reduce each token to its Porter stem.
docs = []
for doc in df['Words ']:
    tokens = word_tokenize(doc)
    tokens = [t.lower() for t in tokens if t.isalnum()]
    tokens = [t for t in tokens if t not in stop_words]
    tokens = [stemmer.stem(t) for t in tokens]
    docs.append(tokens)

# Store the token lists next to the raw text; one list per row.
df['Processed'] = docs
print(df.head())

   Doc                    Words  Class
0    1   Chinese Beijing Chinese     c
1    2  Chinese Chinese Shanghai     c
2    3             Chinese Macao     c
3    4       Tokyo Japan Chinese     j
Index(['Doc', 'Words ', 'Class'], dtype='object')
   Doc                    Words  Class                   Processed
0    1   Chinese Beijing Chinese     c      [chines, beij, chines]
1    2  Chinese Chinese Shanghai     c  [chines, chines, shanghai]
2    3             Chinese Macao     c             [chines, macao]
3    4       Tokyo Japan Chinese     j      [tokyo, japan, chines]


In [None]:
from sklearn.feature_extraction.text import CountVectorizer

# CountVectorizer expects strings, so each token list is joined back into a
# single space-separated document before counting.
joined_docs = df['Processed'].apply(' '.join)

vectorizer = CountVectorizer()
bow = vectorizer.fit_transform(joined_docs)

# Column labels for the count matrix: one per vocabulary term.
feature_names = vectorizer.get_feature_names_out()

# Dense view of the sparse counts, rows = documents, columns = terms.
bow_df = pd.DataFrame(data=bow.toarray(), columns=feature_names)

print(bow_df)


   beij  chines  japan  macao  shanghai  tokyo
0     1       2      0      0         0      0
1     0       2      0      0         1      0
2     0       1      0      1         0      0
3     0       1      1      0         0      1


In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# The vectorizer works on strings, so every processed token list is first
# re-joined into one space-separated document.
joined_docs = df['Processed'].apply(' '.join)

# Learn the vocabulary and IDF weights from the training documents and
# produce their TF-IDF matrix in one step.
tfidf_vectorizer = TfidfVectorizer()
tfidf = tfidf_vectorizer.fit_transform(joined_docs)

# Vocabulary terms, in the same order as the matrix columns.
feature_names = tfidf_vectorizer.get_feature_names_out()

# Dense, labelled copy of the sparse matrix for display.
tfidf_df = pd.DataFrame(data=tfidf.toarray(), columns=feature_names)

print(tfidf_df)


       beij    chines     japan     macao  shanghai     tokyo
0  0.691835  0.722056  0.000000  0.000000  0.000000  0.000000
1  0.000000  0.722056  0.000000  0.000000  0.691835  0.000000
2  0.000000  0.462637  0.000000  0.886548  0.000000  0.000000
3  0.000000  0.346182  0.663385  0.000000  0.000000  0.663385


In [None]:
test_text = input("Enter test data: ")

# Run the test sentence through the same steps as the training documents:
# lower-case, tokenize, keep alphanumeric non-stop-word tokens, then stem.
test_tokens = word_tokenize(test_text.lower())
test_tokens = [t for t in test_tokens if t.isalnum() and t not in stop_words]
test_tokens = [stemmer.stem(t) for t in test_tokens]

# Project the test document into the TF-IDF space fitted on the training data.
test_tfidf = tfidf_vectorizer.transform([' '.join(test_tokens)])

# Look the term up in the fitted vectorizer's own vocabulary (term -> column
# index) rather than in the `feature_names` global, which other cells
# overwrite. The stem is derived with the same stemmer used for the documents
# instead of being hard-coded.
target_stem = stemmer.stem('chinese')
chinese_index = tfidf_vectorizer.vocabulary_.get(target_stem)

if chinese_index is None:
  # The term was never seen during fitting, so it has no column and no weight.
  importance = 0.0
  print("The word 'Chinese' (or its stemmed form) is not present in the training vocabulary.")
else:
  # A value of 0.0 here means the term is in the vocabulary but does not
  # occur in the test sentence.
  importance = test_tfidf[0, chinese_index]
  print("Importance of 'Chinese' in test data:", importance)

Enter test data: Chinese Chinese Chinese Tokyo Japan
Importance of 'Chinese' in test data: 0.7420574954436144


### b. Find the similarity between the test data and any one document from the training data, using cosine similarity as the evaluation metric.

In [None]:
from sklearn.metrics.pairwise import cosine_similarity

# Number of training documents = number of rows in the fitted TF-IDF matrix.
n_docs = tfidf.shape[0]

doc_index = int(input("Enter the index of the document from training data to compare with (0 to {}): ".format(n_docs - 1)))

# Enforce the advertised range. Without this check a negative index would be
# accepted silently and wrap around to a different document.
if not 0 <= doc_index < n_docs:
  raise ValueError("Document index must be between 0 and {}, got {}".format(n_docs - 1, doc_index))

# Both operands are single-row matrices, so the result is a 1x1 array.
similarity = cosine_similarity(test_tfidf, tfidf[doc_index])
print("Cosine similarity with document {}: {}".format(doc_index, similarity[0][0]))


Enter the index of the document from training data to compare with (0 to 3): 2
Cosine similarity with document 2: 0.34330349920760334


### c. Take a dataset of your own that is labelled with sentiment. Split it into training and testing parts and perform sentiment classification with Laplace smoothing applied.

In [None]:
# Load the labelled sentiment spreadsheet and preview its first rows.
df1 = pd.read_excel(io='SA.ods', engine='odf')
print(df1.head(n=5))

   No.                                          Text  Class
0    1      I had a wonderful time at the park today      1
1    2                     This new phone is amazing      1
2    3             The movie I watched was very bad       0
3    4  I love my new job, the team is so supportive      1
4    5                     I had a bad day at work.       0


In [None]:
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score
from sklearn.feature_extraction.text import TfidfVectorizer
import pandas as pd
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer

# Inputs are the raw sentences; targets are the values of the 'Class' column.
X, y = df1['Text'], df1['Class']

# Hold out 20% of the rows for testing; the fixed seed keeps the split
# reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)


# Shared preprocessing resources, read as globals by preprocess_text.
stemmer = PorterStemmer()
stop_words = set(stopwords.words('english'))

def preprocess_text(text):
    """Normalise one sentence into a space-separated string of stems.

    The text is lower-cased and tokenized; tokens that are not purely
    alphanumeric or that appear in the module-level ``stop_words`` set are
    discarded, and the survivors are stemmed with the module-level
    ``stemmer``.

    Args:
        text: The raw sentence as a string.

    Returns:
        The stemmed tokens joined by single spaces (an empty string when no
        token survives the filtering).
    """
    kept = []
    for token in word_tokenize(text.lower()):
        if not token.isalnum() or token in stop_words:
            continue
        kept.append(stemmer.stem(token))
    return ' '.join(kept)


# Apply the same text normalisation to both splits.
X_train_processed, X_test_processed = (
    split.apply(preprocess_text) for split in (X_train, X_test)
)

# TF-IDF features: the vocabulary and IDF weights are learned from the
# training split only, then reused to transform the test split.
vectorizer = TfidfVectorizer()
X_train_tfidf = vectorizer.fit_transform(X_train_processed)
X_test_tfidf = vectorizer.transform(X_test_processed)

# Multinomial Naive Bayes; alpha=1.0 is the additive (Laplace) smoothing term.
nb_classifier = MultinomialNB(alpha=1.0)
nb_classifier.fit(X_train_tfidf, y_train)

# Score the held-out rows.
y_pred = nb_classifier.predict(X_test_tfidf)
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print("Accuracy:", accuracy)

# User input for classification
user_input = input("Enter text to classify sentiment: ")

# Reuse preprocess_text so the sentence goes through exactly the same
# normalisation as the training data, instead of repeating those steps here.
user_input_tfidf = vectorizer.transform([preprocess_text(user_input)])

# predict returns one label per input row; there is a single row here.
predicted_class = nb_classifier.predict(user_input_tfidf)[0]

if predicted_class == 1:
  print("Positive")
elif predicted_class == 0:
  print("Negative")
else:
  # Report any other label explicitly rather than printing nothing.
  print("Unknown class: {}".format(predicted_class))


Accuracy: 1.0
Enter text to classify sentiment: The interviewer was rude, but I did well
Positive
