In [43]:
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings("ignore")
from pathlib import Path

### Loading the data

In [44]:
def import_data(file_path, file_type='auto'):
	"""Load a CSV or Excel file into a pandas DataFrame.

	Parameters
	----------
	file_path : str or pathlib.Path
		Location of the file to read.
	file_type : str, default 'auto'
		Pass 'csv' to force the CSV reader. For any other value the reader is
		picked from the file name: a name ending in '.csv' (any letter case)
		is read as CSV, everything else is read with ``pd.read_excel``.

	Returns
	-------
	pandas.DataFrame
		The parsed file contents.

	Raises
	------
	Exception
		Whatever the pandas reader raises; the error is printed first and
		then re-raised unchanged.
	"""
	# str() makes the extension check work for pathlib.Path inputs as well as
	# plain strings; lower() makes '.CSV' and '.csv' behave the same.
	path_text = str(file_path)
	read_as_csv = file_type == 'csv' or path_text.lower().endswith('.csv')
	try:
		if read_as_csv:
			return pd.read_csv(file_path)
		return pd.read_excel(file_path)
	except Exception as e:
		print(f"Error importing {file_path}: {e}")
		# Bare raise re-raises the active exception with its original traceback.
		raise
# Folder that holds the raw extracts. The location is machine-specific, so it
# is declared once here; edit this single constant when the data lives elsewhere.
DATA_DIR = Path(r'C:\Users\GUMMUDU HEMANTH\Bharat-Herald-Analytics\data_raw')

try:
	# Dimension tables
	ad_category_df = pd.read_excel(DATA_DIR / 'dim_ad_category.xlsx')
	city_df = pd.read_excel(DATA_DIR / 'dim_city.xlsx')

	# Fact tables
	ad_revenue_df = pd.read_csv(DATA_DIR / 'fact_ad_revenue.csv')
	city_readiness_df = pd.read_csv(DATA_DIR / 'fact_city_readiness.csv')
	digital_plot_df = pd.read_csv(DATA_DIR / 'fact_digital_pilot.csv')
	print_sales_df = pd.read_excel(DATA_DIR / 'fact_print_sales.xlsx')

	print("All datasets imported successfully.")

except Exception as e:
	# Report the failure, then stop the cell so later cells do not run on
	# partially loaded data.
	print(f"An error occurred: {e}")
	raise

All datasets imported successfully.


### Data preprocessing


In [45]:
def data_preprocessing(df, df_name):
	"""Print a quality report for one DataFrame.

	Despite its name, this function does not clean or change ``df``; it only
	prints, in order: missing-value counts, the number of duplicated rows,
	the shape, the column dtypes and ``describe(include='all')``.

	Parameters
	----------
	df : pandas.DataFrame
		Table to inspect.
	df_name : str
		Label used in every printed line.

	Returns
	-------
	None
	"""
	# Header: table label followed by a 50-dash rule
	print(f"Preprocessing {df_name}")
	print(f"\n{'-' * 50}\n")

	# Missing values: list only the columns that actually contain nulls
	nulls_per_column = df.isnull().sum()
	if not nulls_per_column.any():
		print(f"No missing values in {df_name}.")
	else:
		print(f"Missing values in {df_name}:{nulls_per_column[nulls_per_column > 0]}")

	# Duplicates: rows that repeat an earlier row exactly
	duplicate_row_count = df.duplicated().sum()
	duplicate_report = (
		f"Found {duplicate_row_count} duplicate rows in {df_name}."
		if duplicate_row_count > 0
		else f"No duplicates rows in {df_name}."
	)
	print(duplicate_report)

	# Size of the table as (rows, columns)
	table_shape = df.shape
	print(f"{df_name} shape: {table_shape}")

	# Column names with their dtypes
	column_dtypes = df.dtypes
	print(f"{df_name} columns and data types:\n{column_dtypes}")

	# Summary statistics across all columns, numeric or not
	summary_stats = df.describe(include='all')
	print(f"Basic statistics for {df_name}:\n{summary_stats}")

	# Footer: 100-character rule separating this report from the next
	print(f"\n{'=' * 100}\n")


# Run the same quality report over every loaded table, in load order.
_tables_to_inspect = [
	(ad_category_df, 'ad_category_df'),
	(city_df, 'city_df'),
	(ad_revenue_df, 'ad_revenue_df'),
	(city_readiness_df, 'city_readiness_df'),
	(digital_plot_df, 'digital_plot_df'),
	(print_sales_df, 'print_sales_df'),
]
for _table, _table_name in _tables_to_inspect:
	data_preprocessing(_table, _table_name)


Preprocessing ad_category_df

--------------------------------------------------

No missing values in ad_category_df.
No duplicates rows in ad_category_df.
ad_category_df shape: (4, 4)
ad_category_df columns and data types:
ad_category_id          object
standard_ad_category    object
category_group          object
example_brands          object
dtype: object
Basic statistics for ad_category_df:
       ad_category_id standard_ad_category     category_group example_brands
count               4                    4                  4              4
unique              4                    4                  3              4
top              A001           Government  Commercial Brands       LIC, SBI
freq                1                    1                  2              1


Preprocessing city_df

--------------------------------------------------

No missing values in city_df.
No duplicates rows in city_df.
city_df shape: (10, 4)
city_df columns and data types:
city_id    object
city

In [46]:
# Preview the first five rows of the digital pilot table.
digital_plot_df.head(n=5)

Unnamed: 0.1,Unnamed: 0,platform,launch_month,ad_category_id,dev_cost,marketing_cost,users_reached,downloads_or_accesses,avg_bounce_rate,cumulative_feedback_from_customers,city_id
0,0,PDF WhatsApp Push,2021-01,A001,236570,66060,23509,16319,52.55,"Mixed feedback: some usability concerns, but h...",C001
1,1,PDF WhatsApp Push,2021-02,A001,156865,99122,19472,17017,82.53,"Mixed feedback: some usability concerns, but h...",C002
2,2,PDF WhatsApp Push,2021-03,A001,242728,46087,8471,2891,68.06,"Mixed feedback: some usability concerns, but h...",C003
3,3,PDF WhatsApp Push,2021-04,A001,147695,78868,46796,15640,66.17,"Mixed feedback: some usability concerns, but h...",C004
4,4,PDF WhatsApp Push,2021-05,A001,325906,135644,16805,3231,76.9,The site takes too long to load on average pho...,C005


### Handling Missing Values

In [51]:
def handling_missing_values(ad_revenue_df, digital_plot_df):
	"""Fill the known gaps in the ad-revenue and digital-pilot tables.

	Both DataFrames are modified in place (a column is reassigned on each)
	and the same objects are returned.

	Parameters
	----------
	ad_revenue_df : pandas.DataFrame
		Must contain a 'comments' column; nulls in it are replaced with a
		fixed placeholder string.
	digital_plot_df : pandas.DataFrame
		Must contain a 'cumulative_feedback_from_customers' column; nulls in
		it are forward-filled from the nearest non-null row above.

	Returns
	-------
	tuple of (pandas.DataFrame, pandas.DataFrame)
		``(ad_revenue_df, digital_plot_df)`` after filling.

	Raises
	------
	KeyError
		If either expected column is absent.
	"""
	# NOTE(review): the placeholder text is misspelled ("Commensts"). It is kept
	# verbatim because it is written into the data and later cells may match on it.
	ad_revenue_df['comments'] = ad_revenue_df['comments'].fillna('No Commensts')

	# Series.ffill() replaces fillna(method='ffill'), which pandas deprecated in
	# 2.1 and which newer releases reject. Forward fill depends on row order, and
	# nulls that come before the first non-null value stay null.
	# NOTE(review): this copies the previous row's feedback text into the gap;
	# confirm that is the intended treatment for this column.
	feedback_col = 'cumulative_feedback_from_customers'
	digital_plot_df[feedback_col] = digital_plot_df[feedback_col].ffill()

	return ad_revenue_df, digital_plot_df

# Fill the missing values and rebind both names to the returned frames.
ad_revenue_df, digital_plot_df = handling_missing_values(ad_revenue_df, digital_plot_df)
