<a href="https://colab.research.google.com/github/LouLouoffer/Customer-Reviews-Analysis-and-Topic-Modeling/blob/main/ETL%2Bpipeline.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Import

In [None]:
# Colab-only setup: install PyDrive, authenticate the current Colab user, and
# download the review archive from Google Drive into the working directory.
!pip install -U -q PyDrive

from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive
from google.colab import auth
from oauth2client.client import GoogleCredentials

# Authenticate in Colab, then hand the application-default credentials to PyDrive.
auth.authenticate_user()
gauth = GoogleAuth()
gauth.credentials = GoogleCredentials.get_application_default()
drive = GoogleDrive(gauth)
# Source file: https://drive.google.com/file/d/1MFLww8ziLlIrKi6q8YU5_EXDgiyEi8_p/view?usp=share_link
file = drive.CreateFile({'id':'1MFLww8ziLlIrKi6q8YU5_EXDgiyEi8_p'}) # replace the id with the id of the file you want to access
file.GetContentFile('watch_reviews.zip')  # zip archive; unpacked by the `!unzip` cell further down (it holds the .tsv/.csv review files)

In [None]:
import numpy as np
import pandas as pd

import glob
import xml.etree.ElementTree as ET
from datetime import datetime

import nltk
from sklearn.feature_extraction.text import TfidfVectorizer
import matplotlib.pyplot as plt

In [None]:
# Unpack the downloaded archive; it creates the watch_reviews/ directory that extract() globs.
# NOTE(review): re-running this cell in the same session finds the files already present — confirm unzip does not stall on an overwrite prompt.
!unzip watch_reviews.zip

Archive:  watch_reviews.zip
   creating: watch_reviews/
  inflating: watch_reviews/.DS_Store  
  inflating: __MACOSX/watch_reviews/._.DS_Store  
  inflating: watch_reviews/watch_reviews2.tsv  
  inflating: __MACOSX/watch_reviews/._watch_reviews2.tsv  
  inflating: watch_reviews/watch_reviews1.tsv  
  inflating: __MACOSX/watch_reviews/._watch_reviews1.tsv  
  inflating: watch_reviews/watch_reviews1.csv  
  inflating: __MACOSX/watch_reviews/._watch_reviews1.csv  
  inflating: watch_reviews/watch_reviews2.csv  
  inflating: __MACOSX/watch_reviews/._watch_reviews2.csv  


In [None]:
# File names used by the ETL steps.
# NOTE(review): within this notebook only `targetfile` is used later (passed to load());
# `tmpfile` and `logfile` are never read, and log() writes to "watch_reviews.txt" — confirm intent.
tmpfile    = "watch_reviews_temp.tmp"               # store all extracted data

logfile    = "watch_reviews_logfile.txt"            # all event logs will be stored

targetfile = "watch_reviews_transformed_data.csv"   # transformed data is stored

In [None]:
def extract_from_tsv(file_to_process):
    """Read a tab-separated file into a DataFrame, skipping malformed rows.

    Parameters
    ----------
    file_to_process : str or path-like
        Path of the ``.tsv`` file to read.

    Returns
    -------
    pandas.DataFrame
        Parsed rows; lines with too many fields are dropped with a warning.

    Notes
    -----
    ``error_bad_lines=False`` was deprecated in pandas 1.3 and removed in 2.0.
    ``on_bad_lines='warn'`` (pandas >= 1.3) is the replacement that keeps the
    old "skip the row but report it" behaviour.
    """
    return pd.read_csv(file_to_process, sep='\t', on_bad_lines='warn')

def extract_from_json(file_to_process):
    """Read a JSON Lines file (one JSON object per line) into a DataFrame.

    ``file_to_process`` is handed straight to ``pandas.read_json`` with
    ``lines=True``; the parsed DataFrame is returned unchanged.
    """
    return pd.read_json(file_to_process, lines=True)

def extract_from_csv(file_to_process):
    """Read a comma-separated file into a DataFrame.

    ``file_to_process`` is passed to ``pandas.read_csv`` with default parser
    settings and the resulting DataFrame is returned as-is.
    """
    return pd.read_csv(file_to_process)

In [None]:
def extract():
    """Read every review file under ``watch_reviews/`` into one DataFrame.

    Files are read in the order csv, json, tsv (each group sorted by name so
    the row order is reproducible) using ``extract_from_csv``,
    ``extract_from_json`` and ``extract_from_tsv`` respectively.

    Returns
    -------
    pandas.DataFrame
        All rows stacked with a fresh 0..n-1 index. The expected review
        columns come first, in a fixed order; any additional columns found in
        the files follow. If no file matches, an empty DataFrame with the
        expected columns is returned.

    Notes
    -----
    ``DataFrame.append`` was removed in pandas 2.0 (and copied the whole frame
    on every call), so the frames are collected and concatenated once.
    """
    expected_columns = [
        'marketplace', 'customer_id', 'review_id', 'product_id',
        'product_parent', 'product_title', 'product_category', 'star_rating',
        'helpful_votes', 'total_votes', 'vine', 'verified_purchase',
        'review_headline', 'review_body', 'review_date',
    ]

    # (glob pattern, reader) pairs, in the same order the files were read before.
    sources = (
        ("watch_reviews/*.csv", extract_from_csv),
        ("watch_reviews/*.json", extract_from_json),
        ("watch_reviews/*.tsv", extract_from_tsv),
    )
    frames = [
        reader(path)
        for pattern, reader in sources
        for path in sorted(glob.glob(pattern))
    ]

    if not frames:
        return pd.DataFrame(columns=expected_columns)

    combined = pd.concat(frames, ignore_index=True)
    # Keep the documented column order; unexpected columns are appended after it.
    extra_columns = [col for col in combined.columns if col not in expected_columns]
    return combined.reindex(columns=expected_columns + extra_columns)

In [None]:
# Run the extract step once to inspect it; the returned DataFrame is displayed but not stored.
extract()

# Transform

In [None]:
def transform(data):
    """Return the reviews that have a body, re-indexed from 0.

    Parameters
    ----------
    data : pandas.DataFrame
        Extracted reviews; must contain a ``review_body`` column.

    Returns
    -------
    pandas.DataFrame
        A new DataFrame without the rows whose ``review_body`` is missing and
        with a fresh 0..n-1 index. The input frame is left untouched, so the
        cell that calls this can be re-run without changing its own input.
    """
    with_body = data.dropna(subset=['review_body'])
    return with_body.reset_index(drop=True)

# NLP model pipeline

In [None]:
# Build the modelling corpus from scratch so the cell works on a fresh kernel:
# extract -> drop reviews without a body -> keep the text of the first 1000 rows.
# NOTE(review): `df` was never defined in the notebook; it is assumed to be the output of extract() — confirm.
df = extract()
reviews = transform(df)
# .loc slicing is label-inclusive, so :999 selects rows 0..999 (1000 reviews).
data = reviews.loc[:999, 'review_body'].tolist()

In [None]:
    # The stop-word corpus must be on disk before it is read; download it here so
    # this cell does not depend on a later cell having been run first.
    nltk.download('stopwords', quiet=True)

    # Use nltk's English stopwords, plus tokens that carry no topic information
    # for this corpus: clitics, the HTML <br> residue and the product word itself.
    # (An earlier draft also appended "n't".)
    stopwords = nltk.corpus.stopwords.words('english')
    stopwords.extend(["'s", "'m", "br", "watch"])

In [None]:
import nltk
# Fetch the NLTK stop-word corpus (the captured output shows it was already up to date).
nltk.download('stopwords')
from nltk.stem.snowball import SnowballStemmer
# from nltk.stem import WordNetLemmatizer

# NOTE(review): tokenization_and_stemming() calls nltk.word_tokenize, which presumably needs
# NLTK's Punkt tokenizer data; no download for it is visible in this notebook — confirm.
# Module-level English Snowball stemmer used by tokenization_and_stemming().
stemmer = SnowballStemmer("english")

# tokenization and stemming
def tokenization_and_stemming(text):
    """Turn one review into a list of stemmed, lower-cased word tokens.

    Steps, in order:
      1. split ``text`` with ``nltk.word_tokenize`` and lower-case every token;
      2. discard tokens found in the notebook-level ``stopwords`` list;
      3. discard tokens that are not purely alphabetic (numbers, punctuation,
         mixed tokens such as "n't");
      4. stem what is left with the notebook-level ``stemmer``.

    Returns the list of stems (possibly empty).
    """
    lowered_tokens = (raw_token.lower() for raw_token in nltk.word_tokenize(text))
    kept_tokens = [
        candidate
        for candidate in lowered_tokens
        if candidate not in stopwords and candidate.isalpha()
    ]
    return [stemmer.stem(candidate) for candidate in kept_tokens]

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [None]:
from sklearn.pipeline import Pipeline
from sklearn.cluster import KMeans
def build_model(n_clusters=5, random_state=42):
    """Create the (unfitted) TF-IDF + k-means topic-clustering pipeline.

    Parameters
    ----------
    n_clusters : int, default 5
        Number of k-means clusters (previously hard-coded to 5).
    random_state : int or None, default 42
        Seed for the k-means initialisation. A fixed seed makes the clusters
        reproducible across runs; pass ``None`` for the previous unseeded
        behaviour.

    Returns
    -------
    sklearn.pipeline.Pipeline
        Steps ``'tfidf'`` (TfidfVectorizer using ``tokenization_and_stemming``
        over unigrams and bigrams) and ``'kmeans'`` (KMeans).
    """
    model_pipeline = Pipeline([
        # Vocabulary: at most 1000 terms, ignoring terms present in more than 99%
        # or fewer than 0.5% of the documents.
        ('tfidf', TfidfVectorizer(max_df=0.99, max_features=1000,
                                 min_df=0.005, stop_words='english',
                                 use_idf=True, tokenizer=tokenization_and_stemming, ngram_range=(1,2))),
        ('kmeans', KMeans(n_clusters=n_clusters, random_state=random_state))
    ])
    return model_pipeline


In [None]:
def model_result(data, pipeline=None, n=10):
    """Print the ``n`` highest-weighted TF-IDF terms of every k-means cluster.

    Parameters
    ----------
    data : any
        Not used by the function; kept so existing ``model_result(data)`` calls
        keep working.
    pipeline : fitted Pipeline, optional
        Pipeline with ``'tfidf'`` and ``'kmeans'`` steps. When omitted, the
        notebook-level variable ``model`` is used, as before.
    n : int, default 10
        Number of top terms printed per cluster (previously hard-coded).

    Returns
    -------
    None
        Results are printed as ``Cluster <i>:`` followed by the term list.
    """
    if pipeline is None:
        # Backward-compatible fallback to the globally fitted pipeline.
        pipeline = model

    feature_names = pipeline.named_steps['tfidf'].get_feature_names_out()

    # One centre per cluster; each centre holds one weight per vocabulary term.
    cluster_centers = pipeline.named_steps['kmeans'].cluster_centers_

    for i, center in enumerate(cluster_centers):
        print(f"Cluster {i}:")
        # argsort is ascending, so reverse it to get the largest weights first.
        top_features_idx = center.argsort()[::-1][:n]
        top_features = [feature_names[idx] for idx in top_features_idx]
        print(top_features)

In [None]:
# Build a fresh, unfitted pipeline and fit it on `data` (the review texts prepared above).
# `model` is the notebook-level name that model_result() reads.
model = build_model()
model.fit(data)

In [None]:
# Print the top terms of each cluster of the fitted notebook-level `model`.
model_result(data)

Cluster 0:
['excel', 'product', 'good product', 'product recommend', 'recommend', 'good', 'thank', 'condit', 'qualiti', 'high']
Cluster 1:
['like', 'band', 'nice', 'look', 'perfect', 'time', 'work', 'beauti', 'love', 'wear']
Cluster 2:
['love', 'nice', 'husband', 'husband love', 'daughter', 'daughter love', 'beauti love', 'classi', 'beauti', 'everyth']
Cluster 3:
['great', 'look', 'great look', 'look great', 'great price', 'work', 'price', 'thank', 'work great', 'qualiti']
Cluster 4:
['good', 'nice', 'look', 'look good', 'good qualiti', 'price', 'qualiti', 'good look', 'good price', 'time']


In [None]:
# NOTE(review): dead cell — the code below sits inside a string literal and never runs.
# It is an inline variant of build_model() + fit (min_df=0.01, ngram_range=(1,3)); consider deleting it.
'''
from sklearn.pipeline import Pipeline
from sklearn.cluster import KMeans
pipeline = Pipeline([
    ('tfidf', TfidfVectorizer(max_df=0.99, max_features=1000, min_df=0.01, stop_words='english', use_idf=True,
                              tokenizer=tokenization_and_stemming, ngram_range=(1,3))),
    ('kmeans', KMeans(n_clusters=5))
])

# Fit the pipeline on the corpus
pipeline.fit(data)
'''

In [None]:
# NOTE(review): dead cell — the code below sits inside a string literal and never runs.
# It is the inline form of model_result() for a variable named `pipeline`; consider deleting it.
'''
feature_names = pipeline.named_steps['tfidf'].get_feature_names_out()

# Get the cluster centers
cluster_centers = pipeline.named_steps['kmeans'].cluster_centers_

# Get the top n features for each cluster
n = 10
for i, center in enumerate(cluster_centers):
    print(f"Cluster {i}:")
    top_features_idx = center.argsort()[::-1][:n]
    top_features = [feature_names[idx] for idx in top_features_idx]
    print(top_features)
'''

# Loading and Logging

In [None]:
def load(targetfile, data_to_load):
    """Write ``data_to_load`` to ``targetfile`` as a CSV file.

    pandas' default ``to_csv`` options are used, so the DataFrame index is
    written as the first (unnamed) column. Nothing is returned.
    """
    data_to_load.to_csv(path_or_buf=targetfile)

In [None]:
from datetime import datetime
def log(message, log_path="watch_reviews.txt"):
    """Append one timestamped line to the ETL log file.

    Parameters
    ----------
    message : str
        Text of the log entry.
    log_path : str, default "watch_reviews.txt"
        File to append to; the default is the path this function has always
        written to.

    Each entry is written as ``<timestamp>,<message>`` followed by a newline.
    Previously the literal character ``n`` was appended instead of ``'\\n'``,
    so all entries ended up on a single line.
    """
    # Hour:Minute:Second-MonthAbbrev-Day-Year. ``%b`` is the documented, portable
    # spelling of the abbreviated month name (``%h`` is a platform-specific alias).
    timestamp_format = '%H:%M:%S-%b-%d-%Y'
    timestamp = datetime.now().strftime(timestamp_format)
    with open(log_path, "a") as f:
        f.write(timestamp + ',' + message + '\n')

# Running ETL Process

In [None]:
# Record the start of the whole ETL run in the log file.
log("ETL Job Started")

In [None]:
# Extract phase: read all review files into `extracted_data`, logging before and after.
log("Extract phase Started")
extracted_data = extract()
log("Extract phase Ended")

  extracted_data = extracted_data.append(extract_from_csv(csvfile), ignore_index=True)
  extracted_data = extracted_data.append(extract_from_csv(csvfile), ignore_index=True)


  dataframe = pd.read_csv(file_to_process, sep='\t', error_bad_lines=False)
  extracted_data = extracted_data.append(extract_from_tsv(csvfile), ignore_index=True)


  dataframe = pd.read_csv(file_to_process, sep='\t', error_bad_lines=False)
  extracted_data = extracted_data.append(extract_from_tsv(csvfile), ignore_index=True)


In [None]:
# Transform phase: drop reviews without a body. The start entry was missing, which
# left the log with an "Ended" line that had no matching "Started" line.
log("Transform phase Started")
transformed_data = transform(extracted_data)
log("Transform phase Ended")

In [None]:
# Load phase: write the transformed reviews to `targetfile` as CSV, logging before and after.
log("Load phase Started")
load(targetfile,transformed_data)
log("Load phase Ended")

In [None]:
# Record the end of the whole ETL run in the log file.
log("ETL Job Ended")