In [None]:
#!pip install polars
%pip install pandas_profiling
%pip install pydantic
import os
import json
import polars as pl 
import pandas as pd
import pandas_profiling
import geopandas as gpd
from ydata_profiling import ProfileReport
import numpy as np

In [2]:

# Profiles every supported data file in the input directory:
# - Loops through all files in the input directory
# - Reads CSV/Excel files with pandas and GeoJSON/Shapefile files with geopandas
# - Generates an HTML ProfileReport (ydata_profiling) for CSV/Excel files only
# - Extracts schema info (dtypes), per-column missing-value counts, and basic shape info
# - Saves the collected results to a JSON file for later analysis
# (Polars is imported in the first cell but is not used by this cell.)
#%pip install polars
#%pip install pandas_profiling
#%pip install pydantic
#import os
#import json
#import polars as pl 
#import pandas as pd
#import pandas_profiling
#import geopandas as gpd
#from ydata_profiling import ProfileReport
#import numpy as np

# Accumulates one result dict per inspected file; later cells serialise this list.
inspection_results = []

directory = r"C:\Users\toast\Downloads\Climate_Data\Tabular\test"
output_folder = r"C:\Users\toast\Downloads\Climate_Data\Tabular\test\profiles"
# Create the output folder if it doesn't exist
os.makedirs(output_folder, exist_ok=True)

# File extensions handled by each reader. Matching is case-insensitive so that
# e.g. "DATA.CSV" is inspected as well.
TABULAR_EXTENSIONS = (".csv", ".xlsx")
SPATIAL_EXTENSIONS = (".geojson", ".shp")


def _summarize_frame(df, profile_report=None):
    """Summarise a (Geo)DataFrame.

    Args:
        df: A pandas DataFrame or geopandas GeoDataFrame.
        profile_report: File name of the HTML profile report written for this
            frame, or None when no report was generated.

    Returns:
        A ``(schema_info, missing_data, useful_info)`` tuple: the per-column
        dtypes, the per-column null counts, and a dict of general facts
        about the frame (which also embeds the first two).
    """
    schema_info = df.dtypes.to_dict()
    missing_data = df.isnull().sum().to_dict()

    useful_info = {
        "Total Rows": len(df),
        "Total Columns": len(df.columns),
        "Column Names": df.columns.tolist(),
    }
    # Only tabular files get a profile report, so the key is optional.
    if profile_report is not None:
        useful_info["Profile Report"] = profile_report
    useful_info["Data Types"] = schema_info
    useful_info["Missing Data"] = missing_data
    return schema_info, missing_data, useful_info


def _inspect_file(file_path, profile_dir):
    """Read one data file and return its summary.

    CSV/Excel files are read with pandas and additionally get an HTML
    ProfileReport written into ``profile_dir``; every other file is handed to
    ``gpd.read_file`` (which handles both GeoJSON and Shapefiles).

    Args:
        file_path: Full path of the file to read.
        profile_dir: Folder that receives the HTML profile reports.

    Returns:
        The ``(schema_info, missing_data, useful_info)`` tuple produced by
        ``_summarize_frame``.

    Raises:
        Exception: Whatever the underlying reader or profiler raises; the
            caller decides how to record the failure.
    """
    file_name = os.path.basename(file_path)
    stem, extension = os.path.splitext(file_name)
    extension = extension.lower()

    if extension in TABULAR_EXTENSIONS:
        df = pd.read_csv(file_path) if extension == ".csv" else pd.read_excel(file_path)

        # The report is written straight into profile_dir, so no separate
        # move/cleanup step is needed afterwards.
        # NOTE: two inputs sharing a stem (e.g. a.csv and a.xlsx) map to the
        # same report file name and the later one overwrites the earlier one.
        output_file_name = f"profile_{stem}.html"
        ProfileReport(df).to_file(os.path.join(profile_dir, output_file_name))
        return _summarize_frame(df, profile_report=output_file_name)

    return _summarize_frame(gpd.read_file(file_path))


# Sorted so that the order of the results does not depend on the file system.
for file in sorted(os.listdir(directory)):
    if not file.lower().endswith(TABULAR_EXTENSIONS + SPATIAL_EXTENSIONS):
        continue

    file_path = os.path.join(directory, file)
    print(f"Examining file: {file}")

    try:
        schema_info, missing_data, useful_info = _inspect_file(file_path, output_folder)
    except Exception as e:
        # Best effort: one unreadable file must not abort the whole run, but
        # the failure is both recorded in the results and shown to the user.
        print(f"Could not inspect {file}: {e}")
        schema_info = {}
        missing_data = {}
        useful_info = {"Error": str(e)}

    # Store the inspection results in a dictionary and append it to the list
    inspection_results.append(
        {
            "File Name": file,
            "Missing Data": missing_data,
            "Schema Info": schema_info,
            "Useful Info": useful_info,
        }
    )
    print("Inspection complete.")
    print("-----")

# Save the inspection results to a JSON file.
# The file is written to the Climate_Data folder, i.e. two levels above the
# input directory (Climate_Data\Tabular\test), not next to the profile reports.
output_file = r"C:\Users\toast\Downloads\Climate_Data\inspection_results.json"

def convert_dtype(obj):
    """Fallback serialiser for ``json.dump(..., default=convert_dtype)``.

    ``json`` only calls this for objects it cannot encode itself.

    Args:
        obj: The non-JSON-serialisable object encountered by the encoder.

    Returns:
        * an ISO-8601 string for ``pd.Timestamp`` values,
        * a native ``bool`` / ``int`` / ``float`` for NumPy scalar values, so
          that numbers stay numbers in the JSON output instead of becoming
          strings such as ``"3"``,
        * ``str(obj)`` for everything else (e.g. NumPy / pandas dtype objects).
    """
    if isinstance(obj, pd.Timestamp):
        return obj.isoformat()
    # Built-in constructors are used (rather than ``.item()``) because they
    # always yield a plain Python value that the encoder can handle.
    if isinstance(obj, np.bool_):
        return bool(obj)
    if isinstance(obj, np.integer):
        return int(obj)
    if isinstance(obj, np.floating):
        return float(obj)
    return str(obj)

# Serialise the collected results; anything json cannot encode natively
# (dtype objects, timestamps, ...) is routed through convert_dtype.
with open(output_file, "w") as results_handle:
    json.dump(
        inspection_results,
        results_handle,
        default=convert_dtype,
    )

# Tell the user where the JSON ended up.
print(f"Inspection results saved to {output_file}")



Examining file: Neighbourhoods_and_Wards_20231017.csv


Summarize dataset:  11%|█         | 1/9 [00:00<00:01,  5.98it/s, Describe variable:Neighbourhood Number]     

Summarize dataset: 100%|██████████| 17/17 [00:10<00:00,  1.62it/s, Completed]                                                   
Generate report structure: 100%|██████████| 1/1 [00:04<00:00,  4.22s/it]
Render HTML: 100%|██████████| 1/1 [00:02<00:00,  2.50s/it]
Export report to file: 100%|██████████| 1/1 [00:00<00:00, 85.84it/s]


Inspection complete.
-----
Examining file: Root_for_Trees_Inventory_20231022.csv


Summarize dataset: 100%|██████████| 46/46 [00:14<00:00,  3.10it/s, Completed]                                         
Generate report structure: 100%|██████████| 1/1 [00:12<00:00, 12.94s/it]
Render HTML: 100%|██████████| 1/1 [00:03<00:00,  3.04s/it]
Export report to file: 100%|██████████| 1/1 [00:00<00:00, 15.87it/s]


Inspection complete.
-----
Examining file: Tree_Insects___Other_Pests_20231017.csv


Summarize dataset: 100%|██████████| 22/22 [00:04<00:00,  5.39it/s, Completed]                                     
Generate report structure: 100%|██████████| 1/1 [00:09<00:00,  9.41s/it]
Render HTML: 100%|██████████| 1/1 [00:01<00:00,  1.41s/it]
Export report to file: 100%|██████████| 1/1 [00:00<00:00, 55.56it/s]


Inspection complete.
-----
Examining file: Vacant_Land_-_Industrial_20231017.csv


Summarize dataset: 100%|██████████| 49/49 [00:14<00:00,  3.31it/s, Completed]                                         
Generate report structure: 100%|██████████| 1/1 [00:12<00:00, 12.81s/it]
Render HTML: 100%|██████████| 1/1 [00:03<00:00,  3.66s/it]
Export report to file: 100%|██████████| 1/1 [00:00<00:00, 32.30it/s]

Inspection complete.
-----
Inspection results saved to C:\Users\toast\Downloads\Climate_Data\inspection_results.json





In [21]:
# Tabulate the inspection results: one row per inspected file, with the
# "Missing Data", "Schema Info" and "Useful Info" dicts kept as object-valued cells.
report = pd.DataFrame(inspection_results)

In [22]:
# Export the table as CSV next to the JSON results. No index=False is passed,
# so the row index is written as well. NOTE(review): the dict-valued columns are
# presumably written as their string representation — confirm this is acceptable
# if the CSV is meant to be machine-readable.
report.to_csv(r"C:\Users\toast\Downloads\Climate_Data\inspection_results.csv")