In [1]:
import pandas as pd
import numpy as np
import os
from typing import List, Dict, Optional
import inspect

In [2]:
pd.set_option('display.max_rows', 400)
pd.set_option('display.max_columns', 100)

In [3]:
def show_df_info(df: pd.DataFrame) -> pd.DataFrame:
    """
    Prints information about a DataFrame, including column names, data types, and non-null counts.
    
    Args:
        df (pandas.DataFrame): The DataFrame to process.
        
    Returns:
        pandas.DataFrame: A DataFrame containing the column names, data types, and non-null counts, and null counts of the input DataFrame.
    """
    col_names = df.columns.to_list()
    col_dtypes = df.dtypes.to_list()
    non_null_counts = df.count().to_list()
    null_counts = df.isnull().sum().to_list()
    info_df = pd.DataFrame({'column_name': col_names, 'dtype': col_dtypes, 'non_null_count': non_null_counts, 'null_count': null_counts})

    caller_frame = inspect.currentframe().f_back
    df_name = [var_name for var_name, var_val in caller_frame.f_locals.items() if var_val is df][0]

    print(f"DataFrame '{df_name}' has {len(df)} rows and {len(df.columns)} columns.")
    print("Here is a summary of the column names, data types and null counts:")
    return info_df

In [4]:
# FUNCTIONS

In [5]:
def assemble_csv(in_dir: str) -> pd.DataFrame:
    """
    Reads all CSV files from the specified directory and loads them into a Pandas DataFrame.

    Args:
        in_dir (str): The input directory containing the CSV files.

    Returns:
        A Pandas DataFrame containing the concatenated data from all CSV files.
    """
    # Create an empty list to store the data from all CSV files
    df_list = list()

    # Iterate over all files in the input directory
    for filename in os.listdir(in_dir):
        if filename.endswith('.csv'):
            # Read the CSV file into a Pandas DataFrame
            df = pd.read_csv(os.path.join(in_dir, filename))

            # Append the data frame to the list
            df_list.append(df)

    # Concatenate all data frames into a single data frame
    combined_df = pd.concat(df_list)

    # Return the combined data frame
    return combined_df


In [6]:
in_dir = 'data/batches_classified'
df = assemble_csv(in_dir)
df.head(1)

Unnamed: 0,kw,rank,link,link.1,success,word_count,percent_human,percent_ai
0,how to finish concrete,1,https://www.familyhandyman.com/project/how-to-...,https://www.familyhandyman.com/project/how-to-...,True,1689.0,99.926081,0.073917


In [7]:
#df['uid'] = df['kw'] + '*' + df['link']

In [8]:
df['uid'] = df.apply(lambda row: f"{row['kw']}_{row['rank']}_{row['link']}", axis=1)

In [9]:
show_df_info(df)

DataFrame 'df' has 19990 rows and 9 columns.
Here is a summary of the column names, data types and null counts:


Unnamed: 0,column_name,dtype,non_null_count,null_count
0,kw,object,19990,0
1,rank,int64,19990,0
2,link,object,19990,0
3,link.1,object,19990,0
4,success,bool,19990,0
5,word_count,float64,16452,3538
6,percent_human,float64,16452,3538
7,percent_ai,float64,16452,3538
8,uid,object,19990,0


In [10]:
df.head(1)

Unnamed: 0,kw,rank,link,link.1,success,word_count,percent_human,percent_ai,uid
0,how to finish concrete,1,https://www.familyhandyman.com/project/how-to-...,https://www.familyhandyman.com/project/how-to-...,True,1689.0,99.926081,0.073917,how to finish concrete_1_https://www.familyhan...


In [11]:
len(df['link'])

19990

In [12]:
len(df['link'][0])

200

In [13]:
#==df['link'][0]

In [14]:
df = df.drop(['link.1'], axis=1)

In [15]:
df.head()

Unnamed: 0,kw,rank,link,success,word_count,percent_human,percent_ai,uid
0,how to finish concrete,1,https://www.familyhandyman.com/project/how-to-...,True,1689.0,99.926081,0.073917,how to finish concrete_1_https://www.familyhan...
1,how to finish concrete,2,https://youtube.com/watch?v=365f0QV-aOs,False,,,,how to finish concrete_2_https://youtube.com/w...
2,how to finish concrete,3,https://youtube.com/watch?v=6SFo4zYwvT4,False,,,,how to finish concrete_3_https://youtube.com/w...
3,how to finish concrete,4,https://www.wikihow.com/Finish-Concrete,True,324.0,88.443929,11.556073,how to finish concrete_4_https://www.wikihow.c...
4,how to finish concrete,5,https://www.instructables.com/How-to-Finish-Co...,True,1417.0,94.22247,5.777531,how to finish concrete_5_https://www.instructa...


In [16]:
non_null_count = df['word_count'].count()
print("Number of non-null values in 'word_count':", non_null_count)

Number of non-null values in 'word_count': 16452


In [17]:
true_count = df['success'].sum()
print("Number of True values in 'success':", true_count)

Number of True values in 'success': 16452


In [18]:
true_count = (df['success'] == False).sum()
true_percent = true_count / len(df) * 100
true_percent

17.698849424712357

In [19]:
class_zero = ((df['percent_human']== 0) & (df['percent_ai'] == 0)).sum()
class_zero

1815

In [20]:
class_zero = ((df['percent_human']== 0) & (df['percent_ai'] == 0)).sum()
class_nan = ((df['percent_human'].isna()) | (df['percent_ai'].isna())).sum()
x = len(df) - class_zero - class_nan
x

14637

In [21]:
class_nonzero = ((df['percent_human']== 0) & (df['percent_ai'] == 0) == False).sum()
class_nonzero

18175

In [22]:
def summarize_results(df):
    success_true = (df['success'] == True).sum()
    success_false = (df['success'] == False).sum()
    class_zero = ((df['percent_human']== 0) & (df['percent_ai'] == 0)).sum()
    class_nonzero = ((df['percent_human']!= 0) & (df['percent_ai'] != 0)).sum()
    class_nan = ((df['percent_human'].isna()) | (df['percent_ai'].isna())).sum()
    print(f'success_true {success_true}')
    print(f'success_false {success_false}')
    print(f'class_zero {class_zero}')
    print(f'class_nonzero {class_nonzero}')
    print(f'class_nan {class_nan}')
    print(f'success_true + success_false + class_zero {success_true + success_false + class_zero}')
    print(f'df len {len(df)}')
    print(f'class_zero + class_nonzero + class_nan {class_zero + class_nonzero + class_nan}')
    print(f'len(df) - class_zero - class_nan {len(df) - class_zero - class_nan }')

In [23]:
summarize_results(df)

success_true 16452
success_false 3538
class_zero 1815
class_nonzero 18047
class_nan 3538
success_true + success_false + class_zero 21805
df len 19990
class_zero + class_nonzero + class_nan 23400
len(df) - class_zero - class_nan 14637


In [24]:
# TEXT TOO SHORT

count = ((df['success'] == True) & (df['percent_human'] == 0) & (df['percent_ai'] == 0)).sum()
print("Number of rows where 'success' is True and 'percent_human' and 'percent_ai' are both 0:", count)
print ('')


Number of rows where 'success' is True and 'percent_human' and 'percent_ai' are both 0: 1815



In [25]:
# WHERE % HUMAN AND % AI ARE BOTH 0 THE WORD_COUNT WAS TOO SHORT
short_df = df[(df['success'] == True) & (df['percent_human'] == 0) & (df['percent_ai'] == 0)]
short_df

Unnamed: 0,kw,rank,link,success,word_count,percent_human,percent_ai,uid
16,how to finish concrete,17,https://www.forconstructionpros.com/concrete/a...,True,99.0,0.0,0.0,how to finish concrete_17_https://www.forconst...
22,saline spray for piercings,3,https://www.walmart.com/c/kp/saline-solutions-...,True,8.0,0.0,0.0,saline spray for piercings_3_https://www.walma...
23,saline spray for piercings,4,https://www.walmart.com/ip/Piercing-Aftercare-...,True,17.0,0.0,0.0,saline spray for piercings_4_https://www.walma...
36,saline spray for piercings,17,https://modify.net.nz/product/piercing-afterca...,True,44.0,0.0,0.0,saline spray for piercings_17_https://modify.n...
43,early stage blood clot in foot pictures,4,https://www.gettyimages.com/photos/blood-clot-leg,True,78.0,0.0,0.0,early stage blood clot in foot pictures_4_http...
...,...,...,...,...,...,...,...,...
77,what is a three way switch,18,https://storage.heightslibrary.org/materials/h...,True,58.0,0.0,0.0,what is a three way switch_18_https://storage....
82,ruched midi dress,3,https://www.dillards.com/search-term/ruched+dr...,True,70.0,0.0,0.0,ruched midi dress_3_https://www.dillards.com/s...
84,ruched midi dress,5,https://www.showpo.com/us/dresses/ruched-dresses/,True,60.0,0.0,0.0,ruched midi dress_5_https://www.showpo.com/us/...
94,ruched midi dress,15,https://www.revolve.com/mobile/superdown-brynn...,True,16.0,0.0,0.0,ruched midi dress_15_https://www.revolve.com/m...


In [26]:
# 

data_df = df[(df['percent_human'] + df['percent_ai'] >= 99)]
data_df

Unnamed: 0,kw,rank,link,success,word_count,percent_human,percent_ai,uid
0,how to finish concrete,1,https://www.familyhandyman.com/project/how-to-...,True,1689.0,99.926081,0.073917,how to finish concrete_1_https://www.familyhan...
3,how to finish concrete,4,https://www.wikihow.com/Finish-Concrete,True,324.0,88.443929,11.556073,how to finish concrete_4_https://www.wikihow.c...
4,how to finish concrete,5,https://www.instructables.com/How-to-Finish-Co...,True,1417.0,94.222470,5.777531,how to finish concrete_5_https://www.instructa...
5,how to finish concrete,6,https://carrollsbuildingmaterials.com/diy-conc...,True,1600.0,99.730074,0.269922,how to finish concrete_6_https://carrollsbuild...
6,how to finish concrete,7,https://www.hunker.com/13402242/how-to-finish-...,True,738.0,97.359934,2.640062,how to finish concrete_7_https://www.hunker.co...
...,...,...,...,...,...,...,...,...
93,ruched midi dress,14,https://www.rihoas.com/collections/ruched-dresses,True,184.0,96.125513,3.874488,ruched midi dress_14_https://www.rihoas.com/co...
95,ruched midi dress,16,https://www.fashionnova.com/products/my-man-an...,True,1774.0,63.200365,36.799634,ruched midi dress_16_https://www.fashionnova.c...
96,ruched midi dress,17,https://www.fashionnova.com/products/my-man-an...,True,1774.0,63.163045,36.836954,ruched midi dress_17_https://www.fashionnova.c...
98,ruched midi dress,19,https://www.loft.com/petites/petite-dresses/ca...,True,1602.0,92.680357,7.319643,ruched midi dress_19_https://www.loft.com/peti...


In [27]:
# CALCULATE PERCENT
new_df = df[(df['percent_human'] + df['percent_ai'] >= 99)]
percentage = round((len(new_df) / len(df)) * 100, 1)
print("Percentage of rows where the sum of 'percent_human' and 'percent_ai' is 99 or greater:", percentage, "%")


Percentage of rows where the sum of 'percent_human' and 'percent_ai' is 99 or greater: 73.2 %


In [28]:
def count_occurrences(df):
    result = df.groupby('kw').apply(lambda x: ((x['percent_human'] + x['percent_ai']) >= 99).sum()).reset_index(name='count')
    avg_values = df.groupby('kw').agg({'percent_human': [lambda x: np.nan if x.isnull().all() or (x == 0).all() else x.min(),
                                                       lambda x: np.nan if x.isnull().all() or (x == 0).all() else x.mean(),
                                                       lambda x: np.nan if x.isnull().all() or (x == 0).all() else x.max()],
                                       'percent_ai': [lambda x: np.nan if x.isnull().all() or (x == 0).all() else x.min(),
                                                      lambda x: np.nan if x.isnull().all() or (x == 0).all() else x.mean(),
                                                      lambda x: np.nan if x.isnull().all() or (x == 0).all() else x.max()]}).reset_index()
    avg_values.columns = ['kw', '%_Human_Min', '%_Human_Mean', '%_Human_Max', '%_AI_Min', '%_AI_Mean', '%_AI_max']
    avg_values = avg_values.round({'%_Human_Min': 1, '%_Human_Mean': 1, '%_Human_Max': 1, '%_AI_Min': 1, '%_AI_Mean': 1, '%_AI_max': 1})
    result = pd.merge(result, avg_values, on='kw')
    return result

In [29]:
z = count_occurrences(df)
z

Unnamed: 0,kw,count,%_Human_Min,%_Human_Mean,%_Human_Max,%_AI_Min,%_AI_Mean,%_AI_max
0,/lh meaning,15,0.0,63.6,99.9,0.0,15.3,56.5
1,2 pole breaker,8,0.0,57.5,100.0,0.0,15.3,74.8
2,205/75r14 trailer tire,7,0.0,42.2,97.5,0.0,21.4,78.5
3,21 lessons for the 21st century,11,0.0,77.6,100.0,0.0,7.1,42.8
4,3 subject notebook,8,0.0,39.5,100.0,0.0,10.5,70.6
...,...,...,...,...,...,...,...,...
995,yellow slip dress,8,0.0,36.5,99.0,0.0,20.7,72.4
996,yoni pearls,16,0.2,78.1,100.0,0.0,21.9,99.8
997,yor forger cosplay,13,0.0,47.6,99.9,0.0,28.8,83.7
998,zinus bed frame,11,0.0,66.8,99.8,0.0,11.8,60.9


In [30]:
z.to_csv('preliminary_results.csv', index=False)

In [31]:
data_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 14637 entries, 0 to 99
Data columns (total 8 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   kw             14637 non-null  object 
 1   rank           14637 non-null  int64  
 2   link           14637 non-null  object 
 3   success        14637 non-null  bool   
 4   word_count     14637 non-null  float64
 5   percent_human  14637 non-null  float64
 6   percent_ai     14637 non-null  float64
 7   uid            14637 non-null  object 
dtypes: bool(1), float64(3), int64(1), object(3)
memory usage: 929.1+ KB


In [32]:
data_df.to_csv('data/data_clean.csv', index=False)