In [25]:
import os
import pandas as pd

# Folder containing the CSV files to summarise.
folder_path = './data/'

# One DataFrame per CSV file, and the matching file names (same order as dfs).
dfs = []
file_titles = []

# os.listdir returns names in arbitrary order; sorting keeps the row order of
# the summary stable between runs. A missing folder still raises
# FileNotFoundError so a wrong path is noticed immediately.
for file_name in sorted(os.listdir(folder_path)):
    if file_name.endswith('.csv'):
        file_path = os.path.join(folder_path, file_name)
        data = pd.read_csv(file_path)
        dfs.append(data)
        file_titles.append(file_name)

# Side-by-side view of every file's columns. pd.concat raises on an empty
# list, so fall back to an empty frame when the folder holds no CSV files.
df = pd.concat(dfs, axis=1) if dfs else pd.DataFrame()

# Build one summary record per feature, walking each source frame directly:
# - the file title is taken from the frame the column was read from, instead
#   of being derived from the column position (which was only right when every
#   file had the same number of columns);
# - columns are addressed by position, so the same column name appearing in two
#   files yields two separate rows;
# - statistics are computed on the source column, so the NaN padding that the
#   axis=1 concat adds to shorter files does not distort dtype or null counts.
summary_records = []
for file_title, frame in zip(file_titles, dfs):
    for position, col in enumerate(frame.columns):
        values = frame.iloc[:, position]
        summary_records.append({
            'index': col,
            'file_title': file_title,
            'data_type': values.dtype,
            'null_count': values.isnull().sum(),
            'unique_count': values.nunique(),
            # files with fewer than two rows have no first/second value
            'first_value': values.iloc[0] if len(values) > 0 else None,
            'second_value': values.iloc[1] if len(values) > 1 else None,
        })

# Summary table with a plain 0..n-1 index; the feature name lives in the
# 'index' column. Passing the column list keeps the layout even when empty.
summary_df = pd.DataFrame(
    summary_records,
    columns=['index', 'file_title', 'data_type', 'null_count',
             'unique_count', 'first_value', 'second_value'],
)

In [26]:
# Bare last expression: the notebook renders the summary DataFrame as a table.
summary_df

Unnamed: 0,index,file_title,data_type,null_count,unique_count,first_value,second_value
0,a,A.csv,int64,0,3,1,2
1,v,A.csv,int64,0,3,1,2
2,b,A.csv,int64,0,3,1,2
3,s,A.csv,int64,0,3,1,2
4,w,A.csv,int64,0,3,1,2
5,e,A.csv,int64,0,3,1,2
6,asd,A.csv,int64,0,3,456,789
7,qwe,A.csv,int64,0,3,456,789
8,zxc,A.csv,int64,0,3,456,789
9,dfg,A.csv,int64,0,3,456,789


In [28]:
import os
import pandas as pd

# Folder containing the CSV files to summarise.
folder_path = './data/'

# One DataFrame per CSV file, and the matching file titles (same order as dfs).
dfs = []
file_titles = []

# os.listdir returns names in arbitrary order; sorting keeps the row order of
# the summary stable between runs. A missing folder still raises
# FileNotFoundError so a wrong path is noticed immediately.
for file_name in sorted(os.listdir(folder_path)):
    if file_name.endswith('.csv'):
        file_path = os.path.join(folder_path, file_name)
        data = pd.read_csv(file_path)
        dfs.append(data)
        # file title is the file name without its .csv extension
        file_titles.append(os.path.splitext(file_name)[0])

# Side-by-side view of every file's columns. pd.concat raises on an empty
# list, so fall back to an empty frame when the folder holds no CSV files.
df = pd.concat(dfs, axis=1) if dfs else pd.DataFrame()

# Build one summary record per feature, walking each source frame directly:
# - the file title is taken from the frame the column was read from, instead
#   of being derived from the column position (which was only right when every
#   file had the same number of columns);
# - columns are addressed by position, so the same column name appearing in two
#   files yields two separate rows;
# - statistics are computed on the source column, so the NaN padding that the
#   axis=1 concat adds to shorter files does not distort dtype or null counts.
feature_names = []
summary_records = []
for file_title, frame in zip(file_titles, dfs):
    for position, col in enumerate(frame.columns):
        values = frame.iloc[:, position]
        feature_names.append(col)
        summary_records.append({
            'file_title': file_title,
            'data_type': values.dtype,
            'null_count': values.isnull().sum(),
            'unique_count': values.nunique(),
            # files with fewer than two rows have no first/second value
            'first_value': values.iloc[0] if len(values) > 0 else None,
            'second_value': values.iloc[1] if len(values) > 1 else None,
        })

# Summary table indexed by feature name. Passing the column list keeps the
# layout even when no CSV files were found.
summary_df = pd.DataFrame(
    summary_records,
    index=feature_names,
    columns=['file_title', 'data_type', 'null_count',
             'unique_count', 'first_value', 'second_value'],
)

# print the summary table
print(summary_df)


      file_title data_type  null_count  unique_count  first_value  \
a              A     int64           0             3            1   
v              A     int64           0             3            1   
b              A     int64           0             3            1   
s              A     int64           0             3            1   
w              A     int64           0             3            1   
e              A     int64           0             3            1   
asd            A     int64           0             3          456   
qwe            A     int64           0             3          456   
zxc            A     int64           0             3          456   
dfg            A     int64           0             3          456   
cvb            b     int64           0             3          456   
sdf            b     int64           0             3          456   
wer            b     int64           0             3          456   
ert            b     int64        

In [29]:
# Bare last expression: the notebook renders the summary DataFrame as a table.
summary_df

Unnamed: 0,file_title,data_type,null_count,unique_count,first_value,second_value
a,A,int64,0,3,1,2
v,A,int64,0,3,1,2
b,A,int64,0,3,1,2
s,A,int64,0,3,1,2
w,A,int64,0,3,1,2
e,A,int64,0,3,1,2
asd,A,int64,0,3,456,789
qwe,A,int64,0,3,456,789
zxc,A,int64,0,3,456,789
dfg,A,int64,0,3,456,789


In [32]:
import os
import pandas as pd

# Folder containing the CSV files to summarise.
folder_path = './data/'

# One DataFrame per CSV file, and the matching file titles (same order as dfs).
dfs = []
file_titles = []

# os.listdir returns names in arbitrary order; sorting keeps the row order of
# the summary stable between runs. A missing folder still raises
# FileNotFoundError so a wrong path is noticed immediately.
for file_name in sorted(os.listdir(folder_path)):
    if file_name.endswith('.csv'):
        file_path = os.path.join(folder_path, file_name)
        data = pd.read_csv(file_path)
        dfs.append(data)
        # file title is the file name without its .csv extension
        file_titles.append(os.path.splitext(file_name)[0])

# Side-by-side view of every file's columns. pd.concat raises on an empty
# list, so fall back to an empty frame when the folder holds no CSV files.
df = pd.concat(dfs, axis=1) if dfs else pd.DataFrame()

# Build one summary record per feature, walking each source frame directly:
# - the file title is taken from the frame the column was read from, instead
#   of being derived from the column position (which was only right when every
#   file had the same number of columns);
# - columns are addressed by position, so the same column name appearing in two
#   files yields two separate rows;
# - statistics are computed on the source column, so the NaN padding that the
#   axis=1 concat adds to shorter files does not distort dtype or null counts.
summary_records = []
for file_title, frame in zip(file_titles, dfs):
    for position, col in enumerate(frame.columns):
        values = frame.iloc[:, position]
        summary_records.append({
            'file_title': file_title,
            'feature_name': col,
            'data_type': values.dtype,
            'null_count': values.isnull().sum(),
            'unique_count': values.nunique(),
            # files with fewer than two rows have no first/second value
            'first_value': values.iloc[0] if len(values) > 0 else None,
            'second_value': values.iloc[1] if len(values) > 1 else None,
        })

# Summary table with a plain 0..n-1 index and the columns in display order.
# Passing the column list keeps the layout even when no CSV files were found.
summary_df = pd.DataFrame(
    summary_records,
    columns=['file_title', 'feature_name', 'data_type', 'null_count',
             'unique_count', 'first_value', 'second_value'],
)

# show the summary table
summary_df

Unnamed: 0,file_title,feature_name,data_type,null_count,unique_count,first_value,second_value
0,A,a,int64,0,3,1,2
1,A,v,int64,0,3,1,2
2,A,b,int64,0,3,1,2
3,A,s,int64,0,3,1,2
4,A,w,int64,0,3,1,2
5,A,e,int64,0,3,1,2
6,A,asd,int64,0,3,456,789
7,A,qwe,int64,0,3,456,789
8,A,zxc,int64,0,3,456,789
9,A,dfg,int64,0,3,456,789
