### Purpose:
The purpose of this notebook is to integrate readibility API with the Correlation Study data

### Dependancies:

In [25]:
import requests
import re
import os
import csv
import json
import time
from IPython.display import clear_output
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import scipy.stats as st
from typing import Tuple, List, Dict, Optional
import re
pd.set_option('display.max_rows', 10)
pd.set_option('display.max_columns', 100)

### Functions:

In [26]:
def get_json_data(directory):
    """
    Recursively flattens all dictionaries in a given directory.
    """
    flattened_list = []
    for filename in os.listdir(directory):
        filepath = os.path.join(directory, filename)
        with open(filepath, "r") as f:
            try:
                d = json.load(f)
                flattened = flatten_dict(d)
                flattened_list.append(flattened)
            except json.JSONDecodeError:
                # Skip over any files that contain invalid JSON data
                print(f"Skipping file {filename}: invalid JSON data")
                pass
    return flattened_list

In [27]:
def flatten_dict(d):
    """
    Recursively flattens a dictionary with nested keys.
    """
    items = {}
    for key, value in d.items():
        #try:
        #if isinstance(value, dict):
        if type(value) == dict:
            flattened = flatten_dict(value)
            for subkey, subvalue in flattened.items():
                items[subkey] = subvalue
        else:
            items[key] = value
        #except:
        #    print("somthing bad happened")
    
    new_dict = {k: v for k, v in items.items()}
    return new_dict

In [28]:
def get_duplicates(df: pd.DataFrame, col_name: str ) -> pd.DataFrame:
    """
    This function takes a Pandas DataFrame and a column name and returns a new DataFrame
    that contains only the rows that have duplicated values in the specified column.
    It also prints the number of records checked, removed, and remaining.

    Parameters:
    - df: the input DataFrame
    - col_name: the name of the column to check for duplicates

    Returns:
    - A new DataFrame that contains only the rows with duplicated values in the specified column.
    """
    count_original = len(df)
    # Use pandas duplicated() method to find the rows with duplicated values in the specified column
    # The keep=False argument keeps all duplicated rows (not just the first occurrence)
    # The copy() method creates a deep copy of the resulting DataFrame to avoid modifying the original one
    df = df[df[col_name].duplicated(keep=False)].copy(deep=True)
    count_final = len(df)
    count_removed = count_original - count_final
    # Print some summary statistics
    print(f"Extracting duplicates from DataFrame:")
    print(f"Records checked: {count_original}")
    print(f"Records removed: {count_removed}")
    print(f"Output df contains {count_final} duplicated records")
    return df

In [29]:
def eval_duplicates(df: pd.DataFrame, col_1: str, col_2: str) -> pd.DataFrame:
    """
    This function takes a Pandas DataFrame and two column names and checks if each non-unique
    value in col_1 has identical values in col_2. It returns a new DataFrame that contains only
    the rows where this condition is not met.

    Parameters:
    - df: the input DataFrame
    - col_1: the name of the first column to check for non-unique values
    - col_2: the name of the second column to check for identical values

    Returns:
    - A new DataFrame that contains only the rows where a non-unique value in col_1 does not have
    identical values in col_2.
    """
    # Get a boolean mask that indicates which values in col_1 are non-unique
    mask = df[col_1].duplicated(keep=False)
    # Use groupby() to group the DataFrame by the non-unique values in col_1
    # Then use transform() to apply a function that checks if the values in col_2 are identical within each group
    identical_values_mask = df.groupby(col_1)[col_2].transform(lambda x: len(set(x)) == 1)
    # Combine the two masks using the & operator to get a final mask
    # that indicates which rows to keep (False means remove the row)
    final_mask = ~mask | identical_values_mask
    # Return a new DataFrame that contains only the rows where the final mask is True
    return df[final_mask]

### Code Execution:

In [30]:
data = get_json_data('data/json/')
len(data)

13588

In [31]:
df1 = pd.DataFrame(data)
df1.tail(1)

Unnamed: 0,link,text,smogScore,colemanScore,automatedReadabilityScore,daleScore,powersSumnerKearlScore,forcastScore,spacheScore,gunningFogScore,fleschGrade,fleschScore,fleschGrade'
13587,https://www.amazon.com/Universal-Washing-Machi...,Amazon.com Enter the characters you see below ...,11.0,10.6,4.5,5.7,2.5,16.7,3.3,7.3,6.1,68.1,


In [32]:
df2 = pd.read_csv('data/data_clean_final.csv')
df2.tail(1)

Unnamed: 0,kw,rank,link,success,word_count,percent_human,percent_ai,uid,Adwords bottom,Adwords sitelink,Adwords top,Featured snippet,Image pack,Knowledge card,Knowledge panel,Local pack,Local teaser,People also ask,Shopping results,Sitelinks,Thumbnail,Top stories,Tweet box,Video preview,Videos
14636,ruched midi dress,20,https://www.loft.com/petites/petite-dresses/ca...,True,1602.0,92.680357,7.319643,ruched midi dress_20_https://www.loft.com/peti...,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0


In [33]:
df = pd.merge(df1, df2, on='link', how='inner')
df.tail(1)

Unnamed: 0,link,text,smogScore,colemanScore,automatedReadabilityScore,daleScore,powersSumnerKearlScore,forcastScore,spacheScore,gunningFogScore,fleschGrade,fleschScore,fleschGrade',kw,rank,success,word_count,percent_human,percent_ai,uid,Adwords bottom,Adwords sitelink,Adwords top,Featured snippet,Image pack,Knowledge card,Knowledge panel,Local pack,Local teaser,People also ask,Shopping results,Sitelinks,Thumbnail,Top stories,Tweet box,Video preview,Videos
14156,https://www.amazon.com/Universal-Washing-Machi...,Amazon.com Enter the characters you see below ...,11.0,10.6,4.5,5.7,2.5,16.7,3.3,7.3,6.1,68.1,,drain hose washing machine,1,True,775.0,83.3,16.7,drain hose washing machine_1_https://www.amazo...,0,0,0,0,0,0,0,0,0,1,1,1,1,0,0,0,0


In [34]:
dup = get_duplicates(df, 'link').copy(deep=True)
dup.tail(1)

Extracting duplicates from DataFrame:
Records checked: 14157
Records removed: 13027
Output df contains 1130 duplicated records


Unnamed: 0,link,text,smogScore,colemanScore,automatedReadabilityScore,daleScore,powersSumnerKearlScore,forcastScore,spacheScore,gunningFogScore,fleschGrade,fleschScore,fleschGrade',kw,rank,success,word_count,percent_human,percent_ai,uid,Adwords bottom,Adwords sitelink,Adwords top,Featured snippet,Image pack,Knowledge card,Knowledge panel,Local pack,Local teaser,People also ask,Shopping results,Sitelinks,Thumbnail,Top stories,Tweet box,Video preview,Videos
14133,https://www.popularmechanics.com/technology/ga...,Best Vacuum Cleaners 2023 | Home Appliance Rev...,12.0,12.0,12.0,7.2,12.4,12.1,5.0,14.2,12.0,37.2,,best vaccuum,17,True,2358.0,85.297084,14.702914,best vaccuum_17_https://www.popularmechanics.c...,0,0,0,0,0,0,0,0,0,1,1,0,1,1,0,0,0


In [35]:
x = eval_duplicates(dup, 'link', 'text')
x.tail(1)

Unnamed: 0,link,text,smogScore,colemanScore,automatedReadabilityScore,daleScore,powersSumnerKearlScore,forcastScore,spacheScore,gunningFogScore,fleschGrade,fleschScore,fleschGrade',kw,rank,success,word_count,percent_human,percent_ai,uid,Adwords bottom,Adwords sitelink,Adwords top,Featured snippet,Image pack,Knowledge card,Knowledge panel,Local pack,Local teaser,People also ask,Shopping results,Sitelinks,Thumbnail,Top stories,Tweet box,Video preview,Videos
14133,https://www.popularmechanics.com/technology/ga...,Best Vacuum Cleaners 2023 | Home Appliance Rev...,12.0,12.0,12.0,7.2,12.4,12.1,5.0,14.2,12.0,37.2,,best vaccuum,17,True,2358.0,85.297084,14.702914,best vaccuum_17_https://www.popularmechanics.c...,0,0,0,0,0,0,0,0,0,1,1,0,1,1,0,0,0
