In [4]:
import os
import pandas as pd

# Load every CSV file found in the data directory, combine them into a single
# DataFrame named `main`, and write the combined frame to "wine_df.csv".

# Directory containing the CSV files
directory = "./Wine_Stats/"

# Initialize variables
# os.listdir() returns entries in arbitrary order, so sort them to make the row
# order of the combined frame reproducible across runs and machines.
try:
    files = sorted(os.listdir(directory))
except OSError as e:
    # Missing or unreadable directory: report it and fall through to the
    # "no valid CSV files" branch below instead of aborting with a traceback.
    print(f"Error reading directory {directory}: {e}")
    files = []
dataframes = []
rows = 0  # running total of rows read; compared against len(main) below

# Process each file
for file in files:
    # Case-insensitive match so files named "*.CSV" are not silently skipped
    if file.lower().endswith(".csv"):
        file_path = os.path.join(directory, file)
        try:
            df = pd.read_csv(file_path)
        except Exception as e:
            # Best effort: one unreadable file is reported and skipped so the
            # remaining files can still be loaded.
            print(f"Error processing file {file}: {e}")
        else:
            rows += len(df)
            dataframes.append(df)

# Concatenate all DataFrames into one
if dataframes:  # Check if any DataFrames were added
    # ignore_index=True gives the combined frame a fresh 0..n-1 index
    main = pd.concat(dataframes, ignore_index=True)
    print(f"Number of rows processed: {rows}")
    print(f"Length of the main DataFrame: {len(main)}")
    main.to_csv("wine_df.csv", index=False)
else:
    # Build the message from `directory` so it stays correct if the path changes
    print(f"Error: No valid CSV files found in the directory. Please check the files in '{directory}'.")


Number of rows processed: 5145
Length of the main DataFrame: 5145


In [6]:
# Initial data exploration and cleaning
# Prints a structural summary of `main`, removes rows containing nulls and exact
# duplicate rows, and writes the result to "project_cleaned.csv".
# The guard turns a missing `main` (loading cell not run, or it found no CSV
# files) into a readable message instead of a NameError. At cell top level
# locals() and globals() are the same namespace, so one lookup is enough.
if 'main' in globals():
    print("DataFrame Info:")
    # DataFrame.info() prints its report itself and returns None; wrapping it
    # in print() would emit a stray "None" line after the summary.
    main.info()  # Shows data types, non-null counts, and memory usage

    # Remove rows with null values, then remove duplicate rows.
    # NOTE(review): dropna() drops a row if ANY column is null. In the saved
    # run, 'Grapes' is non-null in only 2977 of 5145 rows, so this discards a
    # large share of the data - confirm that is intended.
    # NOTE(review): the saved info() report lists an 'Unnamed: 0' column,
    # presumably a row index written into the source CSVs; if so, it can keep
    # otherwise-identical rows from being detected as duplicates - confirm.
    main = main.dropna().drop_duplicates()

    # Save the cleaned DataFrame to a CSV file
    main.to_csv("project_cleaned.csv", index=False)
    print("Cleaned DataFrame saved to 'project_cleaned.csv'")
else:
    print("Error: 'main' DataFrame is not defined. Please ensure the DataFrame is created before running this code.")

DataFrame Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5145 entries, 0 to 5144
Data columns (total 15 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Unnamed: 0         5145 non-null   int64  
 1   Name               5145 non-null   object 
 2   Rating             5145 non-null   float64
 3   Number of Ratings  5145 non-null   int64  
 4   Price              5145 non-null   float64
 5   Region             5145 non-null   object 
 6   Winery             5142 non-null   object 
 7   Wine style         4624 non-null   object 
 8   Alcohol content    5145 non-null   float64
 9   Grapes             2977 non-null   object 
 10  Food pairings      5145 non-null   object 
 11  Bold               5145 non-null   float64
 12  Tannin             5145 non-null   float64
 13  Sweet              5145 non-null   float64
 14  Acidic             5145 non-null   float64
dtypes: float64(7), int64(2), object(6)
memory usage: 603.1+ KB