In [114]:
# import required libs
import os
# import magic
import numpy as np
import pandas as pd
import seaborn as sns
import plotly.express as px
import matplotlib.pyplot as plt


In [115]:
def check_file_existance(file_path):
    is_file_exist = False
    
    if os.path.exists(file_path):
       is_file_exist = True
    else:
        is_file_exist = False
        print(f'File in your path ({file_path}) does not exist.')
        
    return is_file_exist

In [116]:
def get_file_type(file_path):
    file_type = None
    
    # mime = magic.Magic(mime=True)
    # file_type = mime.from_file(file_path)
    file_type = os.path.splitext(file_path)[1].lower()
    
    print(file_type)
    return file_type

In [117]:
def read_data(file_path: str, file_type: str):
    data = None
    is_file_read = False
    
    if file_type == '.csv':
        data = pd.read_csv(file_path)
    elif file_type == '.xlsx':
        data = pd.read_excel(file_path)
    else: 
        print(f'Couldn\'t read file with type ({file_type}) only CSV and Excel can be read.' )
    
    print(data)
    return data

In [118]:
def preprocess_data(data):
    # handle missing values based on data types
       
    # step(1) get columns names of (categorical, numerical) types separitlly
    categorical_cols = data.select_dtypes(include=['object']).columns
    numerical_cols = data.select_dtypes(include=['int64', 'float64']).columns
    
    # step(2) handle messing values
    categorical_data = data[categorical_cols].fillna('Unknown')
    numerical_data = data[numerical_cols].fillna(data[numerical_cols].median())
    
    # step(3) encode categorical features and scale numerical features
    categorical_data = pd.get_dummies(categorical_data, columns=categorical_cols, drop_first=True)
    
    # step(4) scaling numerical data
    numerical_data = (numerical_data - numerical_data.mean()) / numerical_data.std()
    
    
    return pd.concat([categorical_data, numerical_data], axis=1)

In [119]:
def get_plots(data):
    all_plots = []
    
    data_cols = data.columns.values
    for col in data_cols:
        col_type = data[col].dtype
        if col_type == 'object':
            bar_plot = px.bar(data[col].value_counts(), title=f'{col} Distribution')
            pie_plot = px.pie(data, names=col, title=f'{col} Pie Chart')
            all_plots.extend([bar_plot, pie_plot])
        elif col_type in ['int64', 'float64']:
            hist_plot = px.histogram(data, x=col, title=f'{col} Histogram')
            box_plot = px.box(data, y=col, title=f'{col} Box Plot')
            all_plots.extend([hist_plot, box_plot])
        elif col_type == 'datetime64':
            time_series_plot = px.line(data, x=col, title=f'{col} Time Series')
            all_plots.append(time_series_plot)
        else:
            print(f'Column called {col} with type ({col_type}) could not be represented')
    
    return all_plots

In [120]:
def visualize_data(all_plots):
    for plot in all_plots:
        plot.show()

In [121]:
is_existing_file = False
file_path = None
file_type = None
data = None

while not is_existing_file:
    file_path = input('Please enter your data file path: ')
    if check_file_existance(file_path):
        file_type = get_file_type(file_path)
        data = read_data(file_path, file_type)
        data = preprocess_data(data)
        plots = get_plots(data)
        visualize_data(plots)

.csv
          data_dte  Year  Month  usg_apt_id usg_apt  usg_wac  fg_apt_id   
0       05/01/2006  2006      5       12016     GUM        5      13162  \
1       05/01/2003  2003      5       10299     ANC        1      13856   
2       03/01/2007  2007      3       10721     BOS       13      12651   
3       12/01/2004  2004     12       11259     DAL       74      16271   
4       05/01/2009  2009      5       13303     MIA       33      11075   
...            ...   ...    ...         ...     ...      ...        ...   
930803  07/01/2000  2000      7       13303     MIA       33      13605   
930804  04/01/2019  2019      4       13303     MIA       33      14286   
930805  08/01/2000  2000      8       13303     MIA       33      13605   
930806  09/01/2004  2004      9       12266     IAH       74      15632   
930807  05/01/1996  1996      5       12478     JFK       22      14210   

       fg_apt  fg_wac  airlineid carrier  carriergroup        type  Scheduled   
0         MAJ

NameError: name 'column_name' is not defined