<a href="https://colab.research.google.com/github/N0VA-code/Week6/blob/main/Week6.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Install the third-party packages imported below (shell commands run from the notebook cell).
!pip install ray
!pip install modin[ray]
!pip install pandas
!pip install dask
!pip install pyyaml

import pandas as pd
import dask.dataframe as dd
import modin.pandas as mp
import ray
import yaml
import os
import time
import gzip
from google.colab import drive

# Mount Google Drive at /content/drive so the input file below can be read.
drive.mount('/content/drive')

# Location of the input CSV on the mounted Drive.
# NOTE(review): "unbalaced" looks like a typo of "unbalanced" — confirm it
# matches the real file name before changing it.
file_path = '/content/drive/My Drive/unbalaced_20_80_dataset.csv'

def read_with_pandas(file_path):
    """Read a CSV with pandas and print how long the read took.

    Args:
        file_path: Path (or any source ``pd.read_csv`` accepts) of the CSV.

    Returns:
        The DataFrame produced by ``pd.read_csv``.
    """
    # perf_counter is monotonic and high-resolution, so the measured interval
    # cannot be skewed by system clock adjustments the way time.time() can.
    started = time.perf_counter()
    df_pandas = pd.read_csv(file_path)
    elapsed_time = time.perf_counter() - started
    print("Pandas Time:", elapsed_time)
    return df_pandas

def read_with_dask(file_path):
    """Create a Dask DataFrame from a CSV and print how long the call took.

    NOTE(review): ``dd.read_csv`` is presumably lazy, in which case the
    printed time covers setting up the read rather than loading the data,
    and is not directly comparable with the pandas timing — confirm.

    Args:
        file_path: Path of the CSV passed to ``dd.read_csv``.

    Returns:
        The object returned by ``dd.read_csv``.
    """
    # perf_counter is monotonic and high-resolution, so the measured interval
    # cannot be skewed by system clock adjustments the way time.time() can.
    started = time.perf_counter()
    df_dask = dd.read_csv(file_path)
    elapsed_time = time.perf_counter() - started
    print("Dask Time:", elapsed_time)
    return df_dask

# Shut down Ray first, then initialize it again.
# ignore_reinit_error=True is passed so a repeated init is not treated as an error.
# NOTE(review): presumably this prepares the Ray engine for the Modin reader — confirm.
ray.shutdown()
ray.init(ignore_reinit_error=True)

def read_with_modin(file_path):
    """Read a CSV with Modin and print how long the call took.

    Args:
        file_path: Path of the CSV passed to ``mp.read_csv``.

    Returns:
        The DataFrame returned by ``mp.read_csv``.
    """
    # perf_counter is monotonic and high-resolution, so the measured interval
    # cannot be skewed by system clock adjustments the way time.time() can.
    started = time.perf_counter()
    df_modin = mp.read_csv(file_path)
    elapsed_time = time.perf_counter() - started
    print("Modin Time:", elapsed_time)
    return df_modin

def validate_columns(df):
    """Normalize the column names of ``df`` in place and return it.

    Spaces become underscores, then every character that is neither a word
    character nor whitespace is removed.

    Args:
        df: DataFrame whose ``columns`` support the ``.str`` accessor.

    Returns:
        The same ``df`` object, with its ``columns`` reassigned.
    """
    # regex is spelled out on both calls: pandas 2.0 changed the default to
    # regex=False, which makes the character-class pattern match literally
    # and leaves punctuation in place.
    underscored = df.columns.str.replace(' ', '_', regex=False)
    df.columns = underscored.str.replace(r'[^\w\s]', '', regex=True)
    return df

def create_yaml(df, yaml_path):
    """Write the column names of ``df`` to a YAML file.

    The file holds a single mapping with the key ``columns`` whose value is
    the list of column names, written in block (non-flow) style.

    Args:
        df: DataFrame whose column names are recorded.
        yaml_path: Destination path; the file is opened in write mode.
    """
    column_names = df.columns.tolist()
    with open(yaml_path, 'w') as handle:
        yaml.dump({'columns': column_names}, handle, default_flow_style=False)

def validate_with_yaml(df, yaml_path):
    """Check that the columns of ``df`` equal the list stored in a YAML schema.

    Args:
        df: DataFrame whose column names are checked.
        yaml_path: Path of a YAML file containing a ``columns`` list.

    Raises:
        AssertionError: If the column lists differ (names or order).
    """
    with open(yaml_path, 'r') as file:
        schema = yaml.safe_load(file)
    # Raised explicitly rather than via `assert` so the check is not stripped
    # when Python runs with -O; the exception type and message are unchanged.
    if schema['columns'] != df.columns.tolist():
        raise AssertionError("Columns do not match with YAML file.")

def write_and_summarize(df, output_path):
    """Write ``df`` as a gzip-compressed, pipe-delimited file and describe it.

    Args:
        df: DataFrame to write; the index is not written.
        output_path: Destination path of the compressed file.

    Returns:
        A dict with ``rows`` (row count of ``df``), ``columns`` (column
        count) and ``size`` (size in bytes of the written file).
    """
    df.to_csv(output_path, sep='|', index=False, compression='gzip')

    row_count = len(df)
    column_count = len(df.columns)
    # Size is taken after the write so it reflects the compressed file on disk.
    bytes_on_disk = os.stat(output_path).st_size
    return {'rows': row_count, 'columns': column_count, 'size': bytes_on_disk}

# Paths for the generated schema file and the gzip-compressed output.
yaml_path = '/content/schema.yaml'
output_path = '/content/output_file.gz'

# Load the dataset with pandas and normalize its column names.
df = read_with_pandas(file_path)
df = validate_columns(df)

# Write the column list to YAML, then check the frame against that same file.
# NOTE(review): the schema is generated from this very frame just before the
# check, so the validation cannot fail at this point.
create_yaml(df, yaml_path)
validate_with_yaml(df, yaml_path)

# Write the pipe-delimited gzip file and collect its row/column/size summary.
summary = write_and_summarize(df, output_path)

print(summary)
