In [9]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
import gzip
import shutil
import json

# If the notebook was started from inside an "app" folder, step up one level.
# NOTE(review): presumably so relative paths such as the data directory resolve
# from the project root — confirm against the repository layout.
# os.path.basename is used instead of splitting on "/" so the check also works
# with Windows path separators.
if os.path.basename(os.getcwd()) == "app":
    os.chdir("..")
    print("Changed working directory to root")
    print("Current working directory: ", os.getcwd())

In [10]:
def unzip_gz(input_file, output_file, tmp_dir):
    """Decompress one gzip file into ``tmp_dir``.

    Args:
        input_file: Path of the gzip-compressed file to read.
        output_file: Name of the file to create, relative to ``tmp_dir``.
        tmp_dir: Directory that receives the decompressed file; it must
            already exist.

    Returns:
        None. The decompressed bytes are written to ``tmp_dir/output_file``.
    """
    output_path = os.path.join(tmp_dir, output_file)
    with gzip.open(input_file, 'rb') as gz_file, open(output_path, 'wb') as output:
        # Stream in chunks rather than read() so the full decompressed payload
        # never has to be held in memory at once.
        shutil.copyfileobj(gz_file, output)

def unzip_all_gz_files(data_dir, tmp_dir):
    """Decompress every ``*.gz`` file found directly in ``data_dir`` into ``tmp_dir``.

    Each output file is named after its archive with the trailing ``.gz``
    removed. Sub-directories of ``data_dir`` are not searched.

    Args:
        data_dir: Directory to scan for ``.gz`` files.
        tmp_dir: Directory that receives the decompressed files; it is created
            (including missing parents) when it does not exist yet.
    """
    # exist_ok makes repeated calls safe, and makedirs handles nested or
    # absolute paths, which a listing of the current directory never contains.
    os.makedirs(tmp_dir, exist_ok=True)
    for f in os.listdir(data_dir):
        if f.endswith('.gz'):
            # f[:-3] drops the ".gz" suffix to form the output file name.
            unzip_gz(os.path.join(data_dir, f), f[:-3], tmp_dir)

def merge_jsonsdata(data_dir,tmp_dir, output_file):
    """Merge the gzipped JSON-lines files of ``data_dir`` into one JSON array file.

    Every ``*.gz`` file in ``data_dir`` is decompressed into ``tmp_dir``. Each
    non-blank line of every resulting ``*.json`` file is parsed as one JSON
    value, and the collected values are written as a single indented JSON list
    to ``data_dir/output_file``. ``tmp_dir`` is removed afterwards, whether or
    not the merge succeeded.

    Args:
        data_dir: Directory holding the ``.gz`` inputs; also receives the output.
        tmp_dir: Scratch directory for the decompressed files (deleted at the end).
        output_file: Name of the merged JSON file, relative to ``data_dir``.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    data = []
    try:
        unzip_all_gz_files(data_dir, tmp_dir)
        # os.listdir order is arbitrary; sorting keeps the merged output
        # reproducible from run to run.
        for f in sorted(os.listdir(tmp_dir)):
            if f.endswith('.json'):
                # JSON text is UTF-8; do not depend on the platform default encoding.
                with open(os.path.join(tmp_dir, f), 'r', encoding='utf-8') as json_file:
                    for line in json_file:
                        # Blank lines (e.g. a trailing newline) are not JSON values.
                        if line.strip():
                            data.append(json.loads(line))
        with open(os.path.join(data_dir, output_file), 'w', encoding='utf-8') as output:
            json.dump(data, output, indent=4)
    finally:
        # Always drop the scratch directory so a failed run leaves no stale
        # files behind for the next attempt to pick up.
        remove_directory(tmp_dir)

def remove_directory(directory_path):
    """Delete ``directory_path`` recursively and print what happened.

    Failures are reported on stdout instead of being raised: a missing
    directory, a permission problem, and any other exception each produce
    their own message.

    Args:
        directory_path: Path of the directory tree to delete.
    """
    try:
        shutil.rmtree(directory_path)
    except FileNotFoundError:
        outcome = f"Directory '{directory_path}' not found."
    except PermissionError:
        outcome = f"Permission denied to remove directory '{directory_path}'."
    except Exception as exc:
        outcome = f"An error occurred: {exc}"
    else:
        outcome = f"Directory '{directory_path}' and its contents have been removed successfully."
    print(outcome)

def zip(file_path, location, root):
    """Gzip-compress ``root/file_path`` into ``root/location``.

    NOTE(review): this name shadows the built-in ``zip`` for every cell that
    runs afterwards; it is kept unchanged because existing code calls it by
    this name.

    Args:
        file_path: Name of the file to compress, relative to ``root``.
        location: Name of the gzip file to create, relative to ``root``.
        root: Directory that both paths are resolved against.
    """
    src_path = os.path.join(root, file_path)
    dst_path = os.path.join(root, location)
    with open(src_path, 'rb') as src, gzip.open(dst_path, 'wb') as dst:
        # Copy fixed-size chunks: iterating a binary file by "lines" buffers an
        # arbitrarily large span whenever the data contains few newlines.
        shutil.copyfileobj(src, dst)
    
# Input/output locations, relative to the current working directory.
data_dir = 'data'
data_fname = 'data.json'
data_gzip_fname = 'data.json.gz'
data_path = os.path.join(data_dir, data_fname)
tmp_dir = 'tmp'

# List the directory once; this raises FileNotFoundError early if it is missing.
data_dir_entries = os.listdir(data_dir)

if data_gzip_fname not in data_dir_entries:
    # No merged archive yet: build the merged JSON, then compress it.
    merge_jsonsdata(data_dir,tmp_dir, data_fname)
    zip(data_fname, data_gzip_fname, data_dir)
elif data_fname not in data_dir_entries:
    # The archive exists but the merged JSON that is loaded further down does
    # not; restore it rather than reporting a file that is not there.
    unzip_gz(os.path.join(data_dir, data_gzip_fname), data_fname, data_dir)
    print(f"Restored '{data_path}' from '{data_gzip_fname}'.")
else:
    print(f"File '{data_path}' already exists.")

File 'data/data.json' already exists.


In [11]:
def convert_json_to_df(data_path):
    """Load a JSON file and wrap its parsed content in a DataFrame.

    Args:
        data_path: Path of the JSON file to read.

    Returns:
        ``pd.DataFrame`` built from whatever ``json.load`` returns for the file.
    """
    with open(data_path) as handle:
        records = json.load(handle)
    return pd.DataFrame(records)

In [12]:
# data_path was built above as os.path.join(data_dir, data_fname); reuse it.
df = convert_json_to_df(data_path)

## Data Analysis

In [13]:
# Structural overview of the loaded dataframe.
print("\nDataframe dimensions:")
print(df.shape)

print("\nDataframe summary:")
# info() writes its report itself and returns None; wrapping it in print()
# appends a stray "None" line to the output.
df.info()

print("\nDescriptive statistics:")
print(df.describe())

print("\nColumn names:")
print(df.columns)

print("\nData types of columns:")
print(df.dtypes)

print("\nNull values in the dataframe:")
# Per-column null counts; the raw boolean mask is as large as the frame itself
# and says nothing at a glance.
print(df.isnull().sum())

print("\nValue counts of each column:")
for col in df.columns:
    # NOTE(review): object columns parsed from JSON may hold unhashable values
    # (dicts/lists), which nunique() cannot hash — confirm. Counting on the
    # string form of the non-null values is safe either way.
    n_distinct = df[col].dropna().astype(str).nunique()
    print(f"\n{col} has {n_distinct} distinct values")



Dataframe dimensions:
(497661, 12)

Dataframe summary:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 497661 entries, 0 to 497660
Data columns (total 12 columns):
 #   Column          Non-Null Count   Dtype  
---  ------          --------------   -----  
 0   overall         497661 non-null  float64
 1   verified        497661 non-null  bool   
 2   reviewTime      497661 non-null  object 
 3   reviewerID      497661 non-null  object 
 4   asin            497661 non-null  object 
 5   style           236241 non-null  object 
 6   reviewerName    497619 non-null  object 
 7   reviewText      497356 non-null  object 
 8   summary         497538 non-null  object 
 9   unixReviewTime  497661 non-null  int64  
 10  vote            71312 non-null   object 
 11  image           10895 non-null   object 
dtypes: bool(1), float64(1), int64(1), object(9)
memory usage: 42.2+ MB
None

Descriptive statistics:
             overall  unixReviewTime
count  497661.000000    4.976610e+05
mean        4