In [None]:
# TASK 1:

# As part of your academic research, you are tasked with implementing the User Profile Correlation-
# Based Similarity (UPCSim) algorithm for movie recommendations, as outlined in a specific research paper.
# Paper Understanding:
# Carefully read the research paper to understand the theoretical foundations and methodology of the UPCSim algorithm. Take note of the key concepts, equations, and algorithmic steps.

In [3]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix

In [4]:
tags_df = pd.read_csv(r'C:\Users\i\Downloads\archive(6)\genome_tags.csv') 
print(tags_df.head())

   tagId           tag
0      1           007
1      2  007 (series)
2      3  18th century
3      4         1920s
4      5         1930s


In [5]:
tags_df['label'] = tags_df['tag'].apply(lambda x: 'ham' if len(x) > 5 else 'spam')

In [6]:
vectorizer = CountVectorizer(stop_words='english')
X = vectorizer.fit_transform(tags_df['tag'])
y = tags_df['label']

In [7]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

In [8]:
model = MultinomialNB()
model.fit(X_train, y_train)

In [9]:
y_pred = model.predict(X_test)

In [10]:
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred, zero_division=1))

Accuracy: 0.8362831858407079
Confusion Matrix:
 [[189   1]
 [ 36   0]]
Classification Report:
               precision    recall  f1-score   support

         ham       0.84      0.99      0.91       190
        spam       0.00      0.00      1.00        36

    accuracy                           0.84       226
   macro avg       0.42      0.50      0.96       226
weighted avg       0.71      0.84      0.93       226



In [None]:
# TASK 2:
# You are part of a development team tasked with creating a cutting-edge content recommendation
# system for a movie streaming platform. The platform caters to a diverse audience, from casual
# viewers to avid movie enthusiasts. The primary objective is to enhance the user experience by
# offering personalized movie recommendations that align with individual preferences, viewing
# habits, and thematic interests
# Create a Content-Based Recommendation System for MovieLens Dataset

In [11]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report, accuracy_score
import pandas as pd

In [13]:
tags_df = pd.read_csv(r'C:\Users\i\Downloads\archive(6)\genome_tags.csv')  
print(tags_df.head())

   tagId           tag
0      1           007
1      2  007 (series)
2      3  18th century
3      4         1920s
4      5         1930s


In [14]:
sentiment_lexicon = {
    'good': 'positive', 'excellent': 'positive', 'amazing': 'positive',
    'bad': 'negative', 'boring': 'negative', 'terrible': 'negative'
}
tags_df['sentiment'] = tags_df['tag'].apply(lambda x: sentiment_lexicon.get(x.lower(), 'neutral'))

In [15]:
vectorizer = TfidfVectorizer(stop_words='english')
X = vectorizer.fit_transform(tags_df['tag'])
y = tags_df['sentiment']

In [16]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

In [17]:
model = MultinomialNB()
model.fit(X_train, y_train)

In [18]:
y_pred = model.predict(X_test)

In [19]:
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred, zero_division=1))

Accuracy: 0.995575221238938
Classification Report:
               precision    recall  f1-score   support

    negative       1.00      0.00      0.00         1
     neutral       1.00      1.00      1.00       225

    accuracy                           1.00       226
   macro avg       1.00      0.50      0.50       226
weighted avg       1.00      1.00      0.99       226



In [None]:
# TASK 3:
# You are a data scientist working on an email filtering system for a large email service provider. The
# goal is to automatically categorize incoming emails into "Spam" or "Ham" (non-spam) categories.
# The company has decided to leverage Natural Language Processing (NLP) techniques and a
# Multinomial Naive Bayes classifier for this task.
# How would you design and implement a spam and ham classification system using NLP and Multinomial Naive Bayes? Consider the
# challenges associated with distinguishing between
# legitimate and spam emails

In [20]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
import nltk
import re

In [21]:
nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\i\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\i\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [22]:
data = {
    'email': [
        "Win a free iPhone now! Click here.",
        "Meeting rescheduled to 10 AM tomorrow.",
        "Claim your lottery prize today.",
        "Project update: deadline extended to next week.",
        "Congratulations, you have been selected for a free trip to Bali!",
        "Don't forget our team lunch tomorrow at 12 PM."
    ],
    'label': ['spam', 'ham', 'spam', 'ham', 'spam', 'ham']
}
df = pd.DataFrame(data)

In [23]:
def preprocess_text(text):
    text = text.lower()
    text = re.sub(r'[^\w\s]', '', text)
    tokens = word_tokenize(text)
    stop_words = set(stopwords.words('english'))
    tokens = [word for word in tokens if word not in stop_words]
    stemmer = PorterStemmer()
    tokens = [stemmer.stem(word) for word in tokens]
    return ' '.join(tokens)

df['processed_email'] = df['email'].apply(preprocess_text)

In [24]:
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(df['processed_email'])
y = df['label']

In [25]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

In [26]:
model = MultinomialNB()
model.fit(X_train, y_train)

In [27]:
y_pred = model.predict(X_test)

In [28]:
print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred, zero_division=1))

Accuracy: 1.0

Confusion Matrix:
 [[1 0]
 [0 1]]

Classification Report:
               precision    recall  f1-score   support

         ham       1.00      1.00      1.00         1
        spam       1.00      1.00      1.00         1

    accuracy                           1.00         2
   macro avg       1.00      1.00      1.00         2
weighted avg       1.00      1.00      1.00         2



In [None]:
# TASK 4:
# Imagine you are part of a team developing a sentiment analysis system for an e-commerce platform.
# The goal is to automatically classify product reviews into positive, negative, or neutral sentiments
# using a Naive Bayes classifier. This system will help the platform understand customer feedback
# more efficiently and improve the overall user experience.
# Exercise:
# Design and implement a Naive Bayes machine learning model for sentiment analysis based on product reviews.

In [29]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer

In [30]:
nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\i\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\i\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [31]:
data = {
    'review': [
        "This product is amazing, I love it!",
        "Worst purchase I’ve ever made.",
        "The product is okay, nothing special.",
        "Absolutely fantastic! Highly recommend.",
        "Not worth the money.",
        "The best purchase ever, I am so happy with this."
    ],
    'sentiment': ['positive', 'negative', 'neutral', 'positive', 'negative', 'positive']
}

df = pd.DataFrame(data)

In [32]:
def preprocess_text(text):
    text = text.lower()
    tokens = word_tokenize(text)
    stop_words = set(stopwords.words('english'))
    tokens = [word for word in tokens if word not in stop_words]
    stemmer = PorterStemmer()
    tokens = [stemmer.stem(word) for word in tokens]
    return ' '.join(tokens)

In [33]:
df['processed_review'] = df['review'].apply(preprocess_text)

In [34]:
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(df['processed_review'])
y = df['sentiment']

In [35]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5, random_state=42)

In [36]:
model = MultinomialNB()
model.fit(X_train, y_train)

In [37]:
y_pred = model.predict(X_test)

In [38]:
print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred, zero_division=1))

Accuracy: 0.3333333333333333

Confusion Matrix:
 [[1 0 0]
 [0 0 0]
 [1 1 0]]

Classification Report:
               precision    recall  f1-score   support

    negative       0.50      1.00      0.67         1
     neutral       0.00      1.00      0.00         0
    positive       1.00      0.00      0.00         2

    accuracy                           0.33         3
   macro avg       0.50      0.67      0.22         3
weighted avg       0.83      0.33      0.22         3

