### Purpose:
The purpose of this notebook is to integrate the readability API results with the Correlation Study data.

### Dependencies:

In [480]:
import os
import json
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import scipy.stats as st
from typing import Tuple, List, Dict, Optional
import re
pd.set_option('display.max_rows', 20)
pd.set_option('display.max_columns', 100)

### Functions:

In [481]:
def flatten_dict(d):
    """
    Recursively flatten a dictionary with nested dictionaries into one level.

    Nested keys are NOT prefixed with their parent key: every leaf value is
    stored under its own innermost key. If the same leaf key occurs more than
    once, the occurrence visited last (in dict iteration order) wins. A key
    whose value is an empty dict contributes nothing to the result.

    Parameters:
    - d: a dictionary whose values may themselves be dictionaries

    Returns:
    - A new, flat dict mapping each leaf key to its leaf value.
    """
    items = {}
    for key, value in d.items():
        # isinstance (instead of an exact type comparison) so that dict
        # subclasses such as OrderedDict are flattened as well
        if isinstance(value, dict):
            items.update(flatten_dict(value))
        else:
            items[key] = value
    return items

In [482]:
def get_json_data(directory):
    """
    Load every JSON file in `directory` and return the flattened contents.

    Each regular file directly inside `directory` is parsed with `json.load`
    and passed through `flatten_dict`. Sub-directories are skipped (they are
    not descended into). Files that cannot be decoded or parsed as JSON are
    reported on stdout and skipped.

    Parameters:
    - directory: path of the folder that holds the JSON files

    Returns:
    - A list with one flattened dict per successfully parsed file, ordered by
      file name.
    """
    flattened_list = []
    # os.listdir() returns entries in arbitrary order; sort them so repeated
    # runs yield the same row order
    for filename in sorted(os.listdir(directory)):
        filepath = os.path.join(directory, filename)
        # open() cannot read a directory, so skip anything that is not a file
        if not os.path.isfile(filepath):
            continue
        # JSON text is UTF-8; do not depend on the platform default encoding
        with open(filepath, "r", encoding="utf-8") as f:
            try:
                d = json.load(f)
            except (json.JSONDecodeError, UnicodeDecodeError):
                # Skip over any files that contain invalid JSON data
                print(f"Skipping file {filename}: invalid JSON data")
                continue
        flattened_list.append(flatten_dict(d))
    return flattened_list

In [483]:
def get_dup(df: pd.DataFrame, col_name: str) -> pd.DataFrame:
    """
    Return the rows of `df` whose `col_name` value occurs more than once.

    Every member of a duplicate group is returned (the first occurrence
    included) as an independent deep copy, so the caller's DataFrame is left
    untouched. A short summary of the row counts is printed.

    Parameters:
    - df: the input DataFrame
    - col_name: the name of the column to check for duplicates

    Returns:
    - A new DataFrame holding only the rows whose `col_name` value is duplicated.
    """
    n_checked = len(df)
    # keep=False flags every row of a duplicate group, not only the repeats
    is_duplicated = df[col_name].duplicated(keep=False)
    duplicates = df[is_duplicated].copy(deep=True)
    n_kept = len(duplicates)
    # Summary of what was scanned and what was filtered out
    print(f"Copying records where column `{col_name}` has duplicates from DataFrame:")
    print(f"Records checked: {n_checked}")
    print(f"Records removed: {n_checked - n_kept}")
    print(f"Output df column `{col_name}` contains {n_kept} duplicated records")
    return duplicates

In [484]:
def remove_dup(df: pd.DataFrame, col_name: str) -> pd.DataFrame:
    """
    Return the rows of `df` whose `col_name` value occurs exactly once.

    Every member of a duplicate group is removed, the first occurrence
    included (unlike `drop_duplicates(keep='first')`). The result is an
    independent deep copy, so the caller's DataFrame is left untouched.
    A short summary of the row counts is printed.

    Parameters:
    - df: the input DataFrame
    - col_name: the name of the column to check for duplicates

    Returns:
    - A new DataFrame holding only the rows whose `col_name` value is not duplicated.
    """
    n_checked = len(df)
    # duplicated(keep=False) flags every row of a duplicate group; inverting
    # the mask leaves only the values that appear a single time
    is_unique = ~df[col_name].duplicated(keep=False)
    singles = df[is_unique].copy(deep=True)
    n_kept = len(singles)
    # Summary of what was scanned and what was filtered out
    print(f"Removing records where column `{col_name}` has duplicates from DataFrame:")
    print(f"Records checked: {n_checked}")
    print(f"Records removed: {n_checked - n_kept}")
    print(f"Output df column `{col_name}` contains {n_kept} non duplicated records")
    return singles

### Code Execution:

In [485]:
# Load Readability-Rank Correlation Study (RRCS) data:
# one flattened dict per JSON file found in data/json/
data = get_json_data('data/json/')
len(data)

13588

In [486]:
# Build the RRCS DataFrame from the list of flattened dicts (one row per dict)
df_rr = pd.DataFrame(data)
df_rr.tail(1)

Unnamed: 0,link,text,smogScore,colemanScore,automatedReadabilityScore,daleScore,powersSumnerKearlScore,forcastScore,spacheScore,gunningFogScore,fleschGrade,fleschScore,fleschGrade'
13587,https://www.amazon.com/Universal-Washing-Machi...,Amazon.com Enter the characters you see below ...,11.0,10.6,4.5,5.7,2.5,16.7,3.3,7.3,6.1,68.1,


In [487]:
# Load AI-Content-Rank Correlation Study (ACRCS) data
df_acr = pd.read_csv('data/data_clean_final.csv')
# Drop the listed columns; the columns that remain are shown in the preview below
df_acr = df_acr.drop(['Adwords bottom', 'Adwords sitelink', 'Adwords top', 'Featured snippet', 'Image pack', 'Knowledge card', 'Knowledge panel', 'Local pack', 'Local teaser', 'People also ask', 'Shopping results', 'Sitelinks', 'Thumbnail', 'Top stories', 'Tweet box', 'Video preview', 'Videos'], axis=1)
df_acr.tail(1)

Unnamed: 0,kw,rank,link,success,word_count,percent_human,percent_ai,uid
14636,ruched midi dress,20,https://www.loft.com/petites/petite-dresses/ca...,True,1602.0,92.680357,7.319643,ruched midi dress_20_https://www.loft.com/peti...


In [488]:
# Merge the readability scores (df_rr) with the AI-content data (df_acr) on the page URL.
# The inner join keeps only links present in both frames; merge() already returns a
# new DataFrame, so no extra copy is needed.
df_full = pd.merge(df_rr, df_acr, on='link', how='inner')
# Shorten the column names; columns that are not listed keep their current names.
df_full = df_full.rename(columns={
    'smogScore': 'smog',
    'colemanScore': 'cole',
    'automatedReadabilityScore': 'auto',
    'daleScore': 'dale',
    'powersSumnerKearlScore': 'pows',
    'forcastScore': 'forc',
    'spacheScore': 'spac',
    'gunningFogScore': 'gunn',
    'fleschGrade': 'fleG1',
    'fleschScore': 'fleS',
    'fleschGrade\'': 'fleG2',  # second Flesch-grade column whose name ends with an apostrophe
    'percent_human': 'hcs',
    'percent_ai': 'aics'
})
df_full.tail(1)

Unnamed: 0,link,text,smog,cole,auto,dale,pows,forc,spac,gunn,fleG1,fleS,fleG2,kw,rank,success,word_count,hcs,aics,uid
14156,https://www.amazon.com/Universal-Washing-Machi...,Amazon.com Enter the characters you see below ...,11.0,10.6,4.5,5.7,2.5,16.7,3.3,7.3,6.1,68.1,,drain hose washing machine,1,True,775.0,83.3,16.7,drain hose washing machine_1_https://www.amazo...


In [489]:
# check `success` column to see if any are False
count = df_full[df_full['success'] == False].shape[0]
print(f"Number of records with success == False: {count}")
# the count is 0 in the recorded run, so the `success` column can be dropped

Number of records with success == False: 0


In [490]:
# Drop `uid`, `kw` and `success`.
# NOTE: re-running this cell on its own raises KeyError, because the columns are already gone.
df_full = df_full.drop(['uid', 'kw', 'success'], axis=1)

In [491]:
# `fleG2` identified as a problem: it is almost entirely null, and `fleG1` has a missing value
# (see the Non-Null counts in the output)
df_full.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 14157 entries, 0 to 14156
Data columns (total 17 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   link        14157 non-null  object 
 1   text        14157 non-null  object 
 2   smog        14157 non-null  float64
 3   cole        14157 non-null  float64
 4   auto        14157 non-null  float64
 5   dale        14157 non-null  float64
 6   pows        14157 non-null  float64
 7   forc        14157 non-null  float64
 8   spac        14157 non-null  float64
 9   gunn        14157 non-null  float64
 10  fleG1       14156 non-null  float64
 11  fleS        14157 non-null  float64
 12  fleG2       1 non-null      float64
 13  rank        14157 non-null  int64  
 14  word_count  14157 non-null  float64
 15  hcs         14157 non-null  float64
 16  aics        14157 non-null  float64
dtypes: float64(14), int64(1), object(2)
memory usage: 1.9+ MB


In [492]:
# Show the rows that carry a value in `fleG2`; in the recorded run this is only row 5187,
# which has its Flesch grade in `fleG2` while `fleG1` is empty
df_full[df_full['fleG2'].notna()]

Unnamed: 0,link,text,smog,cole,auto,dale,pows,forc,spac,gunn,fleG1,fleS,fleG2,rank,word_count,hcs,aics
5187,https://www.logitech.com/en-us/products/webcam...,"Webcams - 4K, Full HD, 1080p | Logitech Webcam...",12.0,12.0,12.0,10.0,8.9,15.1,5.0,19.0,,5.4,12.0,10,139.0,80.520546,19.479455


In [493]:
# Fix the misplaced record: wherever `fleG1` is missing, take the value from `fleG2`
# of the same row (values are matched by index label); rows that already have `fleG1` are unchanged
df_full['fleG1'] = df_full['fleG1'].fillna(df_full['fleG2'])


In [494]:
# Confirm that the record is fixed and that no rows were added or lost.
# NOTE: index label 5187 is specific to the recorded run of this dataset.
print(f"Dataset length is still {len(df_full)}")
df_full.loc[[5187]]

Dataset length is still 14157


Unnamed: 0,link,text,smog,cole,auto,dale,pows,forc,spac,gunn,fleG1,fleS,fleG2,rank,word_count,hcs,aics
5187,https://www.logitech.com/en-us/products/webcam...,"Webcams - 4K, Full HD, 1080p | Logitech Webcam...",12.0,12.0,12.0,10.0,8.9,15.1,5.0,19.0,12.0,5.4,12.0,10,139.0,80.520546,19.479455


In [495]:
# Drop the almost-empty `fleG2` column, then rename the repaired `fleG1` column.
# NOTE: the new name is `fleG2` (not `fleG`), so from here on the Flesch grade
# values live under the column name `fleG2`.
df_full = df_full.drop(columns=['fleG2']).rename(columns={'fleG1': 'fleG2'})
df_full.loc[[5187]]

Unnamed: 0,link,text,smog,cole,auto,dale,pows,forc,spac,gunn,fleG2,fleS,rank,word_count,hcs,aics
5187,https://www.logitech.com/en-us/products/webcam...,"Webcams - 4K, Full HD, 1080p | Logitech Webcam...",12.0,12.0,12.0,10.0,8.9,15.1,5.0,19.0,12.0,5.4,10,139.0,80.520546,19.479455


In [496]:
# Drop records where 'link' has duplicates, keeping the first occurrence of each link.
# NOTE(review): an earlier note expected 576 removals, but the recorded run removed 575 — confirm.
count_old = len(df_full)
df_full = df_full.drop_duplicates(subset=['link'], keep='first')
count_new = len(df_full)
print(f"{count_old-count_new} records removed {count_new} remaining")

575 records removed 13582 remaining


In [497]:
# Re-check dtypes and non-null counts after the column fix and the de-duplication
df_full.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 13582 entries, 0 to 14156
Data columns (total 16 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   link        13582 non-null  object 
 1   text        13582 non-null  object 
 2   smog        13582 non-null  float64
 3   cole        13582 non-null  float64
 4   auto        13582 non-null  float64
 5   dale        13582 non-null  float64
 6   pows        13582 non-null  float64
 7   forc        13582 non-null  float64
 8   spac        13582 non-null  float64
 9   gunn        13582 non-null  float64
 10  fleG2       13582 non-null  float64
 11  fleS        13582 non-null  float64
 12  rank        13582 non-null  int64  
 13  word_count  13582 non-null  float64
 14  hcs         13582 non-null  float64
 15  aics        13582 non-null  float64
dtypes: float64(13), int64(1), object(2)
memory usage: 1.8+ MB


In [498]:
# Build a compact per-column summary (name, non-null count, dtype) as a DataFrame
summary_data = {
    'Column': df_full.columns,
    'Non-Null Count': df_full.count().values,
    'Dtype': df_full.dtypes.values
}

summary_df = pd.DataFrame(summary_data)
summary_df

Unnamed: 0,Column,Non-Null Count,Dtype
0,link,13582,object
1,text,13582,object
2,smog,13582,float64
3,cole,13582,float64
4,auto,13582,float64
5,dale,13582,float64
6,pows,13582,float64
7,forc,13582,float64
8,spac,13582,float64
9,gunn,13582,float64


In [499]:
# Renumber the index from 0 after rows were dropped; the old index labels are discarded
df_full = df_full.reset_index(drop=True)

In [500]:
# Save the combined dataset without the index column, using a backslash as the CSV escape character.
# NOTE(review): the column reordering in the following cell runs after this write,
# so the CSV keeps the current column order — confirm that is intended.
df_full.to_csv('data/combined_final.csv', escapechar='\\', index=False)

In [501]:
# Reorder the columns: rank first, then the readability scores, hcs/aics, word_count, text and link.
# Any column not listed here would be dropped by this selection.
df_full = df_full[['rank', 'smog', 'cole', 'auto', 'dale', 'pows', 'forc',
    'spac', 'gunn', 'fleG2', 'fleS', 'hcs', 'aics', 'word_count', 'text', 'link']]

In [502]:
# Preview the last row of the reordered frame
df_full.tail(1)

Unnamed: 0,rank,smog,cole,auto,dale,pows,forc,spac,gunn,fleG2,fleS,hcs,aics,word_count,text,link
13581,1,11.0,10.6,4.5,5.7,2.5,16.7,3.3,7.3,6.1,68.1,83.3,16.7,775.0,Amazon.com Enter the characters you see below ...,https://www.amazon.com/Universal-Washing-Machi...
