In [3]:
# Import dependencies
import matplotlib.pyplot as plt
import pandas as pd
import os
import seaborn as sns
from scipy import stats
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn import metrics
from sklearn.linear_model import ElasticNet, Lasso, Ridge
from sklearn.model_selection import KFold, cross_val_score 


In [4]:
# Compile the per-section C100 CSV files into a single DataFrame (c100_df)
# and save it to disk. The first file supplies the column headers; every
# other file is relabelled positionally to match it.

# List of CSV files
csv_files = [
    'data/C100/c100_SEC01.csv', 'data/C100/c100_SEC02.csv', 'data/C100/c100_SEC04.csv',
    'data/C100/c100_SEC05.csv', 'data/C100/c100_SEC06.csv', 'data/C100/c100_SEC07.csv',
    'data/C100/c100_SEC08.csv', 'data/C100/c100_SEC09.csv', 'data/C100/c100_SEC10.csv',
    'data/C100/c100_SEC11.csv', 'data/C100/c100_SEC12.csv', 'data/C100/c100_SEC13.csv',
    'data/C100/c100_SEC14.csv', 'data/C100/c100_SEC15.csv', 'data/C100/c100_SEC16.csv',
    'data/C100/c100_SEC17.csv', 'data/C100/c100_SEC18.csv', 'data/C100/c100_SEC19.csv'
]

# Where the compiled DataFrame is written
output_path = 'cleaned_data/c100_compiled_dataframe.csv'


def _load_section(path, columns=None):
    """Read one section CSV and return it with fully blank rows removed.

    Parameters
    ----------
    path : str
        Path of the CSV file to read.
    columns : pandas.Index or list, optional
        Column labels to assign positionally. When omitted, the file's own
        header is kept.

    Returns
    -------
    pandas.DataFrame
        The file's contents without rows in which every cell is missing.

    Raises
    ------
    ValueError
        If ``columns`` is given and its length differs from the number of
        columns in the file (raised by pandas on assignment).
    """
    section_df = pd.read_csv(path)
    if columns is not None:
        # Rename columns to match the correct columns
        section_df.columns = columns
    # Return a new frame instead of mutating in place
    return section_df.dropna(how='all')


# Initialize an empty list to store DataFrames
dataframes = []

# Read the first CSV file to get the correct column headers. This read is
# deliberately outside the try/except below: without the reference headers
# nothing else can be aligned, so a failure here should stop the cell.
initial_df = _load_section(csv_files[0])
correct_columns = initial_df.columns

# Process the first file (cleaned the same way as all the others)
dataframes.append(initial_df)
print(f"Read {csv_files[0]} successfully with {len(initial_df)} rows.")

# Loop through the remaining files
for file in csv_files[1:]:
    try:
        # Read, relabel and clean the current CSV file
        df = _load_section(file, columns=correct_columns)

        # Count is reported after blank rows are dropped, so the per-file
        # counts add up to the compiled total printed below
        print(f"Read {file} successfully with {len(df)} rows.")

        # Append the DataFrame to the list
        dataframes.append(df)

    except Exception as e:
        # Best effort: report the problem file and continue with the rest
        print(f"Failed to read {file}: {e}")

# Check if there are any DataFrames to concatenate
if dataframes:
    # Concatenate all DataFrames in the list into a single DataFrame
    c100_df = pd.concat(dataframes, ignore_index=True)

    # Make sure the output directory exists before writing
    os.makedirs(os.path.dirname(output_path), exist_ok=True)

    # Save the compiled DataFrame
    c100_df.to_csv(output_path, index=False)
    print("Data compiled and saved successfully.")
    print(f"Total rows in compiled dataframe: {len(c100_df)}")
else:
    print("No dataframes to concatenate.")

Read data/C100/c100_SEC01.csv successfully with 59 rows.
Read data/C100/c100_SEC02.csv successfully with 59 rows.
Read data/C100/c100_SEC04.csv successfully with 59 rows.
Read data/C100/c100_SEC05.csv successfully with 59 rows.
Read data/C100/c100_SEC06.csv successfully with 60 rows.
Read data/C100/c100_SEC07.csv successfully with 58 rows.
Read data/C100/c100_SEC08.csv successfully with 59 rows.
Read data/C100/c100_SEC09.csv successfully with 59 rows.
Read data/C100/c100_SEC10.csv successfully with 58 rows.
Read data/C100/c100_SEC11.csv successfully with 59 rows.
Read data/C100/c100_SEC12.csv successfully with 58 rows.
Read data/C100/c100_SEC13.csv successfully with 59 rows.
Read data/C100/c100_SEC14.csv successfully with 59 rows.
Read data/C100/c100_SEC15.csv successfully with 60 rows.
Read data/C100/c100_SEC16.csv successfully with 59 rows.
Read data/C100/c100_SEC17.csv successfully with 59 rows.
Read data/C100/c100_SEC18.csv successfully with 59 rows.
Read data/C100/c100_SEC19.csv s