In [66]:
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings("ignore")
from pathlib import Path
import re

### Loading the data

In [67]:
def import_data(file_path , file_type='auto'):
	try:
		if file_type == 'csv' or file_path.endswith('.csv'):
			return pd.read_csv(file_path)
		else:
			return pd.read_excel(file_path)
	except Exception as e:
		print(f"Error importing {file_path}: {e}")
		raise e
try:
	ad_category_df = pd.read_excel(r'C:\Users\GUMMUDU HEMANTH\Bharat-Herald-Analytics\data_raw\dim_ad_category.xlsx')
	city_df = pd.read_excel(r'C:\Users\GUMMUDU HEMANTH\Bharat-Herald-Analytics\data_raw\dim_city.xlsx')
	ad_revenue_df = pd.read_csv(r'C:\Users\GUMMUDU HEMANTH\Bharat-Herald-Analytics\data_raw\fact_ad_revenue.csv')
	city_readiness_df = pd.read_csv(r'C:\Users\GUMMUDU HEMANTH\Bharat-Herald-Analytics\data_raw\fact_city_readiness.csv')
	digital_plot_df = pd.read_csv(r'C:\Users\GUMMUDU HEMANTH\Bharat-Herald-Analytics\data_raw\fact_digital_pilot.csv')
	print_sales_df = pd.read_excel(r'C:\Users\GUMMUDU HEMANTH\Bharat-Herald-Analytics\data_raw\fact_print_sales.xlsx')

	print("All datasets imported successfully.")

except Exception as e:
	print(f"An error occurred: {e}")
	raise e

All datasets imported successfully.


### Data preprocessing


In [68]:
def data_preprocessing(df, df_name):
	print(f"Preprocessing {df_name}")
	print("\n" + "-"*50 + "\n")

	# Check missing values
	missing_values = df.isnull().sum()
	if missing_values.any():
		print(f"Missing values in {df_name}:{missing_values[missing_values > 0]}")
	else:
		print(f"No missing values in {df_name}.")

	# Check for duplicates
	duplicate_values = df.duplicated().sum()
	if duplicate_values > 0:
		print(f"Found {duplicate_values} duplicate rows in {df_name}.")
	else:
		print(f"No duplicates rows in {df_name}.")

	# Check size of the dataframe
	print(f"{df_name} shape: {df.shape}")

	# Check columns and its data types
	print(f"{df_name} columns and data types:\n{df.dtypes}")

	# Basic statistics for numerical columns
	print(f"Basic statistics for {df_name}:\n{df.describe(include='all')}")
	print("\n" + "="*100 + "\n")


# Preprocess each dataframe
data_preprocessing(ad_category_df, 'ad_category_df')
data_preprocessing(city_df, 'city_df')
data_preprocessing(ad_revenue_df, 'ad_revenue_df')
data_preprocessing(city_readiness_df, 'city_readiness_df')
data_preprocessing(digital_plot_df, 'digital_plot_df')
data_preprocessing(print_sales_df, 'print_sales_df')


Preprocessing ad_category_df

--------------------------------------------------

No missing values in ad_category_df.
No duplicates rows in ad_category_df.
ad_category_df shape: (4, 4)
ad_category_df columns and data types:
ad_category_id          object
standard_ad_category    object
category_group          object
example_brands          object
dtype: object
Basic statistics for ad_category_df:
       ad_category_id standard_ad_category     category_group example_brands
count               4                    4                  4              4
unique              4                    4                  3              4
top              A001           Government  Commercial Brands       LIC, SBI
freq                1                    1                  2              1


Preprocessing city_df

--------------------------------------------------

No missing values in city_df.
No duplicates rows in city_df.
city_df shape: (10, 4)
city_df columns and data types:
city_id    object
city

### Handling Missing Values

In [69]:
def handling_missing_values(ad_revenue_df , digital_plot_df):
	# Handling missing values in ad_revenue_df
	ad_revenue_df['comments'] = ad_revenue_df['comments'].fillna('No Commensts')

	# Handling missing values in digital_plot_df
	digital_plot_df['cumulative_feedback_from_customers'] = digital_plot_df['cumulative_feedback_from_customers'].fillna(method='ffill')

	return ad_revenue_df , digital_plot_df

ad_revenue_df , digital_plot_df = handling_missing_values(ad_revenue_df , digital_plot_df)


### Handling Inconsistencies in Ad_Revenue_Df Table

In [70]:
ad_revenue_df.sample(10)

Unnamed: 0,edition_id,ad_category,quarter,ad_revenue,currency,comments
259,ED1006,A002,Q3-2020,15537.52,EUR,No Commensts
351,ED1001,A003,Q3-2020,3607897.0,IN RUPEES,No Commensts
442,ED1007,A002,Q1-2023,1421291.0,IN RUPEES,No Commensts
260,ED1005,A003,Q3-2022,2724025.0,INR,No Commensts
239,ED1010,A004,Q1-2019,1241017.0,INR,No Commensts
463,ED1009,A003,2022-Q2,1813467.0,INR,No Commensts
690,ED1001,A004,4th Qtr 2020,1635330.0,INR,No Commensts
534,ED1007,A003,Q3-2019,1040707.0,INR,No Commensts
530,ED1006,A003,2022-Q2,2973670.0,INR,No Commensts
309,ED1006,A004,2023-Q2,38727.36,USD,No Commensts


In [71]:
def preprocess_ad_revenue(ad_revenue_df):
	# Create a copy
	ad_revenue_df_copy = ad_revenue_df.copy()

	# Standardize Quarter Column
	ad_revenue_df_copy['year'] = ad_revenue_df_copy['quarter'].str.extract(r'(\d{4})')
	ad_revenue_df_copy['quarter_num'] = ad_revenue_df_copy['quarter'].str.extract(r'([Qq]\d|Qtr\d|[1-4]th\s*Qtr)', flags=re.IGNORECASE)
	ad_revenue_df_copy['quarter_num'] = ad_revenue_df_copy['quarter_num'].str.replace('[^0-9]', '', regex=True)

	# Standardize ad_revenue and currency column
	usd_to_inr = 88.14
	eur_to_inr = 103.39

	ad_revenue_df_copy['currency'] = ad_revenue_df_copy['currency'].str.upper().str.strip()

	# Conditions for each currency type
	conditions = [
		ad_revenue_df_copy['currency'].isin(['INR' , 'IN RUPEES']),
		ad_revenue_df_copy['currency'] == 'USD',
		ad_revenue_df_copy['currency'] == 'EUR'
	]

	# Corresponding choices for each condition
	choices = [
		ad_revenue_df_copy['ad_revenue'],
		ad_revenue_df_copy['ad_revenue'] * usd_to_inr,
		ad_revenue_df_copy['ad_revenue'] * eur_to_inr
	]

	ad_revenue_df_copy['ad_revenue_inr'] = np.select(conditions, choices, default=np.nan)
	ad_revenue_df_copy['ad_revenue_inr'] = ad_revenue_df_copy['ad_revenue_inr'].round(2)
	return ad_revenue_df_copy

ad_revenue_df = preprocess_ad_revenue(ad_revenue_df)


In [72]:
ad_revenue_df.head()

Unnamed: 0,edition_id,ad_category,quarter,ad_revenue,currency,comments,year,quarter_num,ad_revenue_inr
0,ED1005,A001,2023-Q2,22613.69,EUR,No Commensts,2023,2,2338029.41
1,ED1005,A002,Q1-2019,39366.88,USD,No Commensts,2019,1,3469796.8
2,ED1001,A003,Q3-2023,3709860.0,INR,No Commensts,2023,3,3709860.0
3,ED1003,A002,Q3-2023,40969.55,USD,No Commensts,2023,3,3611056.14
4,ED1007,A003,4th Qtr 2020,51779.4,USD,No Commensts,2020,4,4563836.32


### Handling Inconsistencies in Digital Plot table

In [73]:
digital_plot_df.sample(10)

Unnamed: 0.1,Unnamed: 0,platform,launch_month,ad_category_id,dev_cost,marketing_cost,users_reached,downloads_or_accesses,avg_bounce_rate,cumulative_feedback_from_customers,city_id
31,31,Mobile App Beta,2021-08,A002,193175,58854,24715,8189,52.93,"Mixed feedback: some usability concerns, but h...",C002
28,28,Mobile App Beta,2021-05,A002,89596,40258,30279,26586,42.6,"Mixed feedback: some usability concerns, but h...",C009
3,3,PDF WhatsApp Push,2021-04,A001,147695,78868,46796,15640,66.17,"Mixed feedback: some usability concerns, but h...",C004
26,26,Mobile App Beta,2021-03,A002,162617,43428,47963,15016,80.32,"Mixed feedback: some usability concerns, but h...",C007
17,17,E-paper Mobile Web,2021-06,A003,338339,82663,26785,4784,77.8,Many said font was too tiny to read.\nZooming ...,C008
42,42,Responsive Web Version,2021-07,A004,184195,89792,22069,19491,62.56,"Mixed feedback: some usability concerns, but h...",C003
7,7,PDF WhatsApp Push,2021-08,A001,188777,66727,39865,17866,76.14,"Mixed feedback: some usability concerns, but h...",C008
11,11,PDF WhatsApp Push,2021-12,A001,106832,44226,37373,28240,44.09,"Mixed feedback: some usability concerns, but h...",C002
34,34,Mobile App Beta,2021-11,A002,153445,31651,10105,7462,77.84,"Mixed feedback: some usability concerns, but h...",C005
16,16,E-paper Mobile Web,2021-05,A003,169464,35997,28284,9055,82.5,"Mixed feedback: some usability concerns, but h...",C007


In [74]:
def preprocess_digital_plot(digital_plot_df):
	# Create a copy
	digital_plot_df_copy = digital_plot_df.copy()

	# Convert 'launch_month' to datetime
	digital_plot_df_copy['launch_month'] = pd.to_datetime(digital_plot_df_copy['launch_month'], errors='coerce')

	# Extract year , month , month_name and quarter
	digital_plot_df_copy['launch_year'] = digital_plot_df_copy['launch_month'].dt.year
	digital_plot_df_copy['launch_month_num'] = digital_plot_df_copy['launch_month'].dt.month
	digital_plot_df_copy['launch_month_name'] = digital_plot_df_copy['launch_month'].dt.month_name()
	digital_plot_df_copy['launch_quarter'] = digital_plot_df_copy['launch_month'].dt.quarter

	return digital_plot_df_copy

digital_plot_df_test = preprocess_digital_plot(digital_plot_df)

In [75]:
digital_plot_df_test.sample(10)

Unnamed: 0.1,Unnamed: 0,platform,launch_month,ad_category_id,dev_cost,marketing_cost,users_reached,downloads_or_accesses,avg_bounce_rate,cumulative_feedback_from_customers,city_id,launch_year,launch_month_num,launch_month_name,launch_quarter
2,2,PDF WhatsApp Push,2021-03-01,A001,242728,46087,8471,2891,68.06,"Mixed feedback: some usability concerns, but h...",C003,2021,3,March,1
42,42,Responsive Web Version,2021-07-01,A004,184195,89792,22069,19491,62.56,"Mixed feedback: some usability concerns, but h...",C003,2021,7,July,3
28,28,Mobile App Beta,2021-05-01,A002,89596,40258,30279,26586,42.6,"Mixed feedback: some usability concerns, but h...",C009,2021,5,May,2
18,18,E-paper Mobile Web,2021-07-01,A003,226451,85506,22833,7708,73.45,"Mixed feedback: some usability concerns, but h...",C009,2021,7,July,3
1,1,PDF WhatsApp Push,2021-02-01,A001,156865,99122,19472,17017,82.53,"Mixed feedback: some usability concerns, but h...",C002,2021,2,February,1
44,44,Responsive Web Version,2021-09-01,A004,248876,64051,42051,21433,78.6,"Mixed feedback: some usability concerns, but h...",C005,2021,9,September,3
12,12,E-paper Mobile Web,2021-01-01,A003,195725,93646,40525,23092,62.92,"Mixed feedback: some usability concerns, but h...",C003,2021,1,January,1
39,39,Responsive Web Version,2021-04-01,A004,262756,167913,16821,5065,81.3,"Mixed feedback: some usability concerns, but h...",C010,2021,4,April,2
37,37,Responsive Web Version,2021-02-01,A004,199654,57386,8384,5805,83.89,"Mixed feedback: some usability concerns, but h...",C008,2021,2,February,1
9,9,PDF WhatsApp Push,2021-10-01,A001,221877,95697,47845,21517,75.39,"Mixed feedback: some usability concerns, but h...",C010,2021,10,October,4


### Handling Inconsistencies in City Readiness Table

In [76]:
city_readiness_df.sample(10)

Unnamed: 0.1,Unnamed: 0,city_id,quarter,literacy_rate,smartphone_penetration,internet_penetration
35,35,C002,2021-Q4,71.13,48.21,50.41
43,43,C002,2023-Q4,70.58,47.84,47.76
192,192,C009,2019-Q1,75.15,67.85,76.09
96,96,C005,2019-Q1,84.85,70.31,10.0
64,64,C003,2023-Q1,82.81,72.07,66.71
236,236,C010,2024-Q1,70.77,77.29,74.3
219,219,C010,2019-Q4,70.59,76.21,75.06
112,112,C005,2023-Q1,84.6,70.42,10.0
153,153,C007,2021-Q2,66.24,77.1,63.71
176,176,C008,2021-Q1,71.19,79.99,74.27


In [77]:
def preprocess_city_readiness(city_readiness_df):
	# Create a copy
	city_readiness_df_copy = city_readiness_df.copy()

	# Extract year and quarter_num from Quarter Column
	city_readiness_df_copy['year'] = city_readiness_df_copy['quarter'].str.extract(r'(\d{4})')
	city_readiness_df_copy['quarter_num'] = city_readiness_df_copy['quarter'].str.extract(r'([Qq]\d|Qtr\d|[1-4]th\s*Qtr)', flags=re.IGNORECASE)
	city_readiness_df_copy['quarter_num'] = city_readiness_df_copy['quarter_num'].str.replace('[^0-9]', '', regex=True)

	return city_readiness_df_copy

city_readiness_df = preprocess_city_readiness(city_readiness_df)


In [78]:
city_readiness_df.sample(10)

Unnamed: 0.1,Unnamed: 0,city_id,quarter,literacy_rate,smartphone_penetration,internet_penetration,year,quarter_num
57,57,C003,2021-Q2,82.65,69.86,66.96,2021,2
71,71,C003,2024-Q4,82.74,68.67,67.15,2024,4
223,223,C010,2020-Q4,70.68,75.74,73.71,2020,4
62,62,C003,2022-Q3,82.78,70.32,67.08,2022,3
165,165,C007,2024-Q2,66.76,75.45,62.79,2024,2
10,10,C001,2021-Q3,89.14,75.25,55.16,2021,3
90,90,C004,2023-Q3,75.6,67.46,67.99,2023,3
229,229,C010,2022-Q2,70.75,76.34,74.27,2022,2
219,219,C010,2019-Q4,70.59,76.21,75.06,2019,4
59,59,C003,2021-Q4,82.82,70.76,66.48,2021,4


In [79]:
city_readiness_df.isna().sum()

Unnamed: 0                0
city_id                   0
quarter                   0
literacy_rate             0
smartphone_penetration    0
internet_penetration      0
year                      0
quarter_num               0
dtype: int64

### Handling Inconsistencies in Print Sales Table


In [80]:
print_sales_df.head()

Unnamed: 0,edition_ID,City_ID,Language,State,Month,Copies Sold,copies_returned,Net_Circulation
0,ED1005,C005,Hindi,Rajasthan,2023-05-01 00:00:00,404389,13510,390879
1,ED1005,C005,Hindi,Rajasthan,2019-03-01 00:00:00,492943,25024,467919
2,ED1001,C001,hindi,Uttar pradesh,2023-07-01 00:00:00,168893,12285,156608
3,ED1003,C003,Hindi,Madhya_Pradesh,2023-07-01 00:00:00,216540,10117,206423
4,ED1007,C007,Hindi,Jharkhand,2020-10-01 00:00:00,234563,13048,221515


In [81]:
print_sales_df['State'].unique()

array(['Rajasthan', 'Uttar pradesh', 'Madhya_Pradesh', 'Jharkhand',
       'maharashtra', 'Uttar-Pradesh', 'Delhi', 'gujarat', 'bihar',
       'Uttar Pradesh'], dtype=object)

In [82]:
def preprocess_print_sales(print_sales_df):
	# Create a copy
	print_sales_df_copy = print_sales_df.copy()

	print_sales_df_copy['State'] = print_sales_df_copy['State'].str.replace(r'Uttar-Pradesh|Uttar pradesh', 'Uttar Pradesh', regex=True)
	print_sales_df_copy.rename(columns={'Copies Sold': 'Copies_Sold'}, inplace=True)
	# Convert Month to datetime
	print_sales_df_copy['Month'] = pd.to_datetime(print_sales_df_copy['Month'], errors='coerce')

	# Extract year , month , month_name and quarter
	print_sales_df_copy['year'] = print_sales_df_copy['Month'].dt.year
	print_sales_df_copy['month_num'] = print_sales_df_copy['Month'].dt.month
	print_sales_df_copy['month_name'] = print_sales_df_copy['Month'].dt.month_name()
	print_sales_df_copy['quarter'] = print_sales_df_copy['Month'].dt.quarter

	return print_sales_df_copy

print_sales_df = preprocess_print_sales(print_sales_df)

In [83]:
print_sales_df.sample(10)

Unnamed: 0,edition_ID,City_ID,Language,State,Month,Copies_Sold,copies_returned,Net_Circulation,year,month_num,month_name,quarter
627,ED1010,C010,Hindi,Uttar Pradesh,2023-04-01,400526,25275,375251,2023,4,April,2
494,ED1001,C001,hindi,Uttar Pradesh,2023-12-01,167015,9768,157247,2023,12,December,4
173,ED1007,C007,Hindi,Jharkhand,2024-08-01,192417,7528,184889,2024,8,August,3
600,ED1009,C009,Hindi,gujarat,2021-09-01,284447,21726,262721,2021,9,September,3
34,ED1008,C008,Hindi,Uttar Pradesh,2024-08-01,283139,21735,261404,2024,8,August,3
148,ED1001,C001,hindi,Uttar Pradesh,2023-09-01,163687,5466,158221,2023,9,September,3
324,ED1008,C008,Hindi,Uttar Pradesh,2021-10-01,332029,20139,311890,2021,10,October,4
616,ED1007,C007,Hindi,Jharkhand,2023-10-01,194458,7833,186625,2023,10,October,4
680,ED1008,C008,Hindi,Uttar Pradesh,2024-03-01,â‚¹284706,20245,264461,2024,3,March,1
651,ED1008,C008,Hindi,Uttar Pradesh,2024-02-01,284638,12119,272519,2024,2,February,1


### Storing the cleaned datasets for futher analysis.

In [84]:
# Path of the current directory
current_dir = Path.cwd()

# Move to Bharat-Herald-Analytics directory
main_dir = current_dir.parent

# Define paths for cleaned data
cleaned_data_dir = main_dir / 'data_cleaned'

# Create the directory if it doesn't exist
cleaned_data_dir.mkdir(parents=True, exist_ok=True)

print("Directory for cleaned data is set up at:", cleaned_data_dir)


Directory for cleaned data is set up at: c:\Users\GUMMUDU HEMANTH\Bharat-Herald-Analytics\data_cleaned


In [85]:
city_df.to_excel(cleaned_data_dir / 'cleaned_city_data.xlsx', index=False)
ad_revenue_df.to_csv(cleaned_data_dir / 'cleaned_ad_revenue.csv', index=False)
city_readiness_df.to_csv(cleaned_data_dir / 'cleaned_city_readiness.csv', index=False)
digital_plot_df.to_csv(cleaned_data_dir / 'cleaned_digital_plot.csv', index=False)
print_sales_df.to_excel(cleaned_data_dir / 'cleaned_print_sales.xlsx', index=False)
ad_category_df.to_excel(cleaned_data_dir / 'cleaned_ad_category.xlsx', index=False)
print("Cleaned datasets have been saved successfully.")

Cleaned datasets have been saved successfully.
