# 1. Stored the csv file to a variable

In [1]:
import pandas as pd

# Load the first 45,000 rows of the CSV file with proper handling for quoted fields
phone_items = pd.read_csv('phone_user_review_file_1.csv', encoding="utf-8", nrows=45000)

# Filter rows where 'lang' is 'en' because there's also other language inside the files
filtered_items = phone_items[phone_items['lang'] == 'en']

# Select only the 'extract' and 'product' columns
result = filtered_items[['extract', 'product']]

# Remove rows where 'extract' or 'product' contains NaN values
result_clean = result.dropna(subset=['extract', 'product'])

# Display the result to inspect the filtered data
result_clean.head()


Unnamed: 0,extract,product
0,As a diehard Samsung fan who has had every Sam...,Samsung Galaxy S8
1,Love the phone. the phone is sleek and smooth ...,Samsung Galaxy S8
2,Adequate feel. Nice heft. Processor's still sl...,"Samsung Galaxy S8 (64GB) G950U 5.8"" 4G LTE Unl..."
3,Never disappointed. One of the reasons I've be...,Samsung Galaxy S8 64GB (AT&T)
4,I've now found that i'm in a group of people t...,Samsung Galaxy S8


# 2. N-Gram Model

In [2]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [3]:
class NGramLanguageModel:
  def __init__(self, n):
    self.n = n
    self.vectorizer = CountVectorizer(analyzer='word', ngram_range=(n,n))
  def fit_transform(self, corpus):
    return self.vectorizer.fit_transform(corpus)
  def transform(self, corpus):
    return self.vectorizer.transform(corpus)

def calculate_cosine_similarity(matrix, query_v):
  similarities = cosine_similarity(query_v, matrix)
  return similarities

In [4]:
# Prepare data for processing
corpus = result_clean['extract'].tolist()
query = "Samsung"

In [6]:
n = 1
ngram_model = NGramLanguageModel(n)
matrix = ngram_model.fit_transform(corpus)
query_v = ngram_model.transform([query])
print(f"{n}-Gram Model:")
# in here, if you test it in colab, use .get_feature_names_out()
# if you test it in vscode, use .get_feature_names (if your scikit-learn is not updated to the latest)
print(pd.DataFrame(matrix.A, columns=ngram_model.vectorizer.get_feature_names()))
print(query_v)

1-Gram Model:
       00  000  00000000001  000mah  007  01  01pm  02  03  039  ...  ѕιnce  \
0       0    0            0       0    0   0     0   0   0    0  ...      0   
1       0    0            0       0    0   0     0   0   0    0  ...      0   
2       0    0            0       0    0   0     0   0   0    0  ...      0   
3       0    0            0       0    0   0     0   0   0    0  ...      0   
4       0    0            0       0    0   0     0   0   0    0  ...      0   
...    ..  ...          ...     ...  ...  ..   ...  ..  ..  ...  ...    ...   
26756   0    0            0       0    0   0     0   0   0    0  ...      0   
26757   0    0            0       0    0   0     0   0   0    0  ...      0   
26758   0    0            0       0    0   0     0   0   0    0  ...      0   
26759   0    0            0       0    0   0     0   0   0    0  ...      0   
26760   0    0            0       0    0   0     0   0   0    0  ...      0   

       ғeв  ғroм  աɨʟʟ  բɛɑtuʀɛร  բʀѳ

In [7]:
similarities = calculate_cosine_similarity(matrix, query_v)

data = {'Document': corpus, 'Similarity': similarities[0]}
df = pd.DataFrame(data)

print(query)
df

Samsung


Unnamed: 0,Document,Similarity
0,As a diehard Samsung fan who has had every Sam...,0.262613
1,Love the phone. the phone is sleek and smooth ...,0.000000
2,Adequate feel. Nice heft. Processor's still sl...,0.160128
3,Never disappointed. One of the reasons I've be...,0.251976
4,I've now found that i'm in a group of people t...,0.000000
...,...,...
26756,iPhone has been the great device since 2007(fr...,0.000000
26757,It has been 2 days i got my iPhone 6s delivere...,0.000000
26758,Mucho more better than prior model,0.000000
26759,The phone works well except for one issue the ...,0.000000


# 3. TF-IDF Model Word Embedding

In [8]:
# TF-IDF Model
tfidf_vectorizer = TfidfVectorizer()
tfidf_matrix = tfidf_vectorizer.fit_transform(corpus)
query_tfidf_v = tfidf_vectorizer.transform([query])
tfidf_similarities = calculate_cosine_similarity(tfidf_matrix, query_tfidf_v)
df_tfidf = pd.DataFrame({
    'Document': corpus,
    'TF-IDF Similarity': tfidf_similarities[0]
})
print("TF-IDF Similarities:")
print(df_tfidf)

TF-IDF Similarities:
                                                Document  TF-IDF Similarity
0      As a diehard Samsung fan who has had every Sam...           0.171964
1      Love the phone. the phone is sleek and smooth ...           0.000000
2      Adequate feel. Nice heft. Processor's still sl...           0.082599
3      Never disappointed. One of the reasons I've be...           0.165114
4      I've now found that i'm in a group of people t...           0.000000
...                                                  ...                ...
26756  iPhone has been the great device since 2007(fr...           0.000000
26757  It has been 2 days i got my iPhone 6s delivere...           0.000000
26758                 Mucho more better than prior model           0.000000
26759  The phone works well except for one issue the ...           0.000000
26760  My phone was locked with AT&T, but I am happy ...           0.000000

[26761 rows x 2 columns]


# 4. Dependency Parsing

In [9]:
import spacy

In [18]:
# Initialize spaCy model
nlp = spacy.load("en_core_web_sm")

def extract_named_entities_and_parse_tree(sentence):
    doc = nlp(sentence)

    # Extract parts of speech
    pos_tags = {
        "punctuations": [token.text for token in doc if token.pos_ == "PUNCT"],
        "verbs": [token.text for token in doc if token.pos_ == "VERB"],
        "nouns": [token.text for token in doc if token.pos_ in ["NOUN", "PROPN"]],
    }

    # Display formatted dependency parse tree
    print("Formatted Dependency Parse Tree:")
    for token in doc:
        print(f"{token.text} --{token.dep_}--> {token.head.text} ({token.pos_})")

    return pos_tags

def main():
    # Limit to first 100 rows
    for index, row in result_clean.head(100).iterrows():
        extract_text = row['extract']
        print(f"\nAnalyzing Review {index+1}:")
        print("Review Text:", extract_text)
        pos_tags = extract_named_entities_and_parse_tree(extract_text)

        print("\nExtracted Parts of Speech:")
        print("Punctuations:", pos_tags["punctuations"])
        print("Verbs:", pos_tags["verbs"])
        print("Nouns:", pos_tags["nouns"])

if __name__ == "__main__":
    main()


Analyzing Review 1:
Review Text: As a diehard Samsung fan who has had every Samsung phone since before the S series started, this has been my favorite upgrade so far. I am amazed at some of the reviews and think people may just have a defective device that needs to be replaced. The battery life is amazing.
Formatted Dependency Parse Tree:
As --prep--> been (ADP)
a --det--> fan (DET)
diehard --amod--> fan (ADJ)
Samsung --compound--> fan (PROPN)
fan --pobj--> As (NOUN)
who --nsubj--> had (PRON)
has --aux--> had (AUX)
had --relcl--> fan (VERB)
every --det--> phone (DET)
Samsung --compound--> phone (PROPN)
phone --dobj--> had (NOUN)
since --mark--> started (SCONJ)
before --mark--> started (SCONJ)
the --det--> series (DET)
S --compound--> series (PROPN)
series --nsubj--> started (NOUN)
started --advcl--> been (VERB)
, --punct--> been (PUNCT)
this --nsubj--> been (PRON)
has --aux--> been (AUX)
been --ROOT--> been (AUX)
my --poss--> upgrade (PRON)
favorite --amod--> upgrade (ADJ)
upgrade --a

# 5. Grammar Parsing - with NLTK

In [11]:
import nltk
from nltk import CFG
from nltk.parse import ChartParser

In [12]:
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\admin\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [13]:
def demonstrate_nlp_parsing(sentence, grammar):
    words = nltk.word_tokenize(sentence)
    parser = ChartParser(grammar)

    try:
        parses = list(parser.parse(words))
        if parses:
            for tree in parses:
                print(tree, "\n")
                tree.pretty_print()
        else:
            print("No parses found")
    except Exception as e:
        print(f"Error during parsing: {e}")

In [14]:
# Define the expanded CFG with all words included
nlp_grammar = CFG.fromstring("""
  S -> NP VP | PP VP | S Conj S
  NP -> Det N | Det N PP | 'i' | 'I' | 'people' | 'this' | 'the' | 'a' | 'The'
  VP -> V NP | V NP PP | 'have' | 'had' | 'has' | 'been' | 'am' | 'is' | 'may' | 'think' | 'needs' | 'just'
  Det -> 'the' | 'a' | 'every' | 'some' | 'my'
  N -> 'upgrade' | 'life' | 'device' | 'battery' | 'reviews' | 'fan' | 'S' | 'Samsung' | 'phone' | 'series' | 'favorite' | 'diehard' | 'far' | 'defective' | 'amazing'
  V -> 'saw' | 'ate' | 'chased' | 'think' | 'have' | 'had' | 'is' | 'am' | 'started' | 'needs' | 'may' | 'replaced' | 'be' | 'amazed' | 'been'
  PP -> P NP
  P -> 'in' | 'on' | 'with' | 'at' | 'before' | 'since' | 'As' | 'so' | 'of' | 'to' | 'that'
  Conj -> ',' | ' '
""")

# Updated sentences
nlp_sentences = [
    "As a diehard Samsung fan, I have had every Samsung phone since before the S series started",
    "this has been my favorite upgrade so far",
    "I am amazed at some of the reviews",
    "I think people may just have a defective device that needs to be replaced",
    "The battery life is amazing"
]

In [15]:
for i, sentence in enumerate(nlp_sentences):
  print(f"Sentence {i+1}: {nlp_sentences[i]}")
  demonstrate_nlp_parsing(sentence, nlp_grammar)
  print("\n")

Sentence 1: As a diehard Samsung fan, I have had every Samsung phone since before the S series started
No parses found


Sentence 2: this has been my favorite upgrade so far
No parses found


Sentence 3: I am amazed at some of the reviews
No parses found


Sentence 4: I think people may just have a defective device that needs to be replaced
No parses found


Sentence 5: The battery life is amazing
No parses found




# 6. Application Testing

In [16]:
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('averaged_perceptron_tagger')
nltk.download('maxent_ne_chunker')
nltk.download('words')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\admin\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\admin\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\admin\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package maxent_ne_chunker to
[nltk_data]     C:\Users\admin\AppData\Roaming\nltk_data...
[nltk_data]   Package maxent_ne_chunker is already up-to-date!
[nltk_data] Downloading package words to
[nltk_data]     C:\Users\admin\AppData\Roaming\nltk_data...
[nltk_data]   Package words is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\admin\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-

True

In [19]:
import pandas as pd
import spacy
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics.pairwise import cosine_similarity

# Load the data and model (assumed to be pre-trained)
phone_items = pd.read_csv('phone_user_review_file_1.csv', encoding="utf-8", nrows=45000)
filtered_items = phone_items[phone_items['lang'] == 'en']
result = filtered_items[['extract', 'product']]
result_clean = result.dropna(subset=['extract', 'product'])

# Prepare data for classification
corpus = result_clean['extract'].tolist()
categories = result_clean['product'].tolist()

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(corpus, categories, test_size=0.3, random_state=42)

# Train Naive Bayes model
pipeline = Pipeline([
    ('vect', CountVectorizer()),
    ('clf', MultinomialNB())
])
pipeline.fit(X_train, y_train)

# Predict on the test set and calculate accuracy
y_pred = pipeline.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f"Naive Bayes Accuracy: {accuracy * 100:.2f}%")

# Initialize spaCy model
nlp = spacy.load("en_core_web_sm")

def extract_named_entities_and_parse_tree(sentence):
    doc = nlp(sentence)
    pos_tags = {
        "punctuations": [token.text for token in doc if token.pos_ == "PUNCT"],
        "verbs": [token.text for token in doc if token.pos_ == "VERB"],
        "nouns": [token.text for token in doc if token.pos_ in ["NOUN", "PROPN"]],
    }
    print("Formatted Dependency Parse Tree:")
    for token in doc:
        print(f"{token.text} --{token.dep_}--> {token.head.text} ({token.pos_})")
    return pos_tags

def recommend_phone(review):
    # Predict the category of the review
    category = pipeline.predict([review])[0]
    # Create a TF-IDF vectorizer and fit it on the corpus
    tfidf_vectorizer = TfidfVectorizer()
    tfidf_matrix = tfidf_vectorizer.fit_transform(corpus)

    # Transform the review into TF-IDF vector
    query_tfidf_v = tfidf_vectorizer.transform([review])

    # Calculate cosine similarity between the query and the corpus
    tfidf_similarities = cosine_similarity(tfidf_matrix, query_tfidf_v)

    # Add similarities to the DataFrame
    df_tfidf = pd.DataFrame({
        'Document': corpus,
        'Product': categories,
        'TF-IDF Similarity': tfidf_similarities.flatten()
    })

    # Filter by the predicted category
    recommendations = df_tfidf[df_tfidf['Product'] == category]

    # Sort by TF-IDF similarity and get top 5 recommendations
    top_recommendations = recommendations.sort_values(by='TF-IDF Similarity', ascending=False).head(5)

    # Include the original reviews in the output
    top_recommendations = top_recommendations.merge(result_clean[['extract', 'product']], left_on='Document', right_on='extract', how='left')

    return top_recommendations[['product', 'extract', 'TF-IDF Similarity']]

def menu():
    review = ""
    category = "UNKNOWN"

    while True:
        print("\nPHONE RECOMMENDATION APPLICATION BASED ON REVIEWS")
        print(f"YOUR REVIEW: {review if review else 'NO REVIEW'}")
        print(f"YOUR REVIEW CATEGORY: {category}")
        print("1. WRITE YOUR REVIEW")
        print("2. VIEW PHONE RECOMMENDATION")
        print("3. VIEW NAMED ENTITIES RECOGNITION")
        print("4. EXIT")
        choice = input("Choose an option: ")

        if choice == "1":
            review = input("Write your review: ")
            category = pipeline.predict([review])[0]
            print(f"Review categorized as: {category}")

        elif choice == "2":
            if review:
                recommendations = recommend_phone(review)
                print("\nTOP PHONE RECOMMENDATIONS FOR YOU:")
                if recommendations.empty:
                    print("No recommendations found.")
                else:
                    for idx, row in recommendations.iterrows():
                        print(f"{idx + 1}: {row['product']}")
                        print(f"   Review: {row['extract']}")
                        print(f"   Similarity: {row['TF-IDF Similarity']:.4f}")
            else:
                print("Please write a review first.")

        elif choice == "3":
            if review:
                pos_tags = extract_named_entities_and_parse_tree(review)
                print("\nExtracted Parts of Speech:")
                print("Punctuations:", pos_tags["punctuations"])
                print("Verbs:", pos_tags["verbs"])
                print("Nouns:", pos_tags["nouns"])
            else:
                print("Please write a review first.")

        elif choice == "4":
            print("Exiting...")
            break

        else:
            print("Invalid choice. Please try again.")

if __name__ == "__main__":
    menu()

Naive Bayes Accuracy: 10.61%

PHONE RECOMMENDATION APPLICATION BASED ON REVIEWS
YOUR REVIEW: NO REVIEW
YOUR REVIEW CATEGORY: UNKNOWN
1. WRITE YOUR REVIEW
2. VIEW PHONE RECOMMENDATION
3. VIEW NAMED ENTITIES RECOGNITION
4. EXIT
Review categorized as: Apple iPhone 6s Plus

PHONE RECOMMENDATION APPLICATION BASED ON REVIEWS
YOUR REVIEW: apple iphone is small
YOUR REVIEW CATEGORY: Apple iPhone 6s Plus
1. WRITE YOUR REVIEW
2. VIEW PHONE RECOMMENDATION
3. VIEW NAMED ENTITIES RECOGNITION
4. EXIT

TOP PHONE RECOMMENDATIONS FOR YOU:
1: Apple iPhone 6s Plus
   Review: Apple is an awesome company!
   Similarity: 0.2877
2: Apple iPhone 6s Plus
   Review: In my opinion, the iPhone 6s Plus is the best iPhone to date. It has a perfect size screen that is not too big and not too small. It has great features, excellent performance, and much better than any Android phone on the market today. I love how the set up of the iPhone always stays the same.
   Similarity: 0.2783
3: Apple iPhone 6s Plus
   Review: