In [None]:
# Import required libraries
import pandas as pd
import numpy as np
import sqlite3 as sql
import matplotlib.pyplot as plt
import plotly.express as px
import seaborn as sns

In [None]:
# Establish connection to the database
# NOTE(review): db_path is an absolute, machine-specific Windows path, so the
# notebook can only be re-run on a machine where this exact file exists.
db_path = r"C:\Users\user\Desktop\3Signet Project2 Task\ABCPharmData.db"
# This connection is reused below by pd.read_sql and by the raw SQL cursor.
conn = sql.connect(db_path)

In [None]:
# Load data into a DataFrame
# Pull every row and column of the ABCData table into pandas.
query = "SELECT * FROM ABCData"
df = pd.read_sql(query, conn)
# Preview the first rows as the cell output.
df.head()

### Data Preprocessing and Validation

In [None]:
# Summarise column dtypes and non-null counts before any type conversion.
df.info()

In [None]:
# Cast the Quantity and Sales columns to integers in a single astype call,
# then print the resulting dtypes to confirm the conversion.
# NOTE(review): an integer cast discards any fractional part — confirm that
# neither column carries decimals that matter.
df = df.astype({"Quantity": int, "Sales": int})
print(df.dtypes)

In [None]:
# Converting the 'Month' column to numeric format
# The '%B' format parses full month names; .dt.month then yields the month number.
# NOTE(review): this overwrites 'Month' in place, so re-running the cell on the
# already-converted column will presumably fail to parse — run once per kernel.
df['Month'] = pd.to_datetime(df['Month'], format='%B').dt.month

In [None]:
# Sanity check: list the distinct month numbers produced by the conversion.
print(sorted(df['Month'].unique()))

In [None]:
# Creating a cursor object to execute SQL queries
# The cursor is opened on the same connection; the raw SQL checks below use it
# to query the database directly.
cursor = conn.cursor()

In [None]:
# Check for duplicated IDs: group the table by ID and keep only the groups
# that contain more than one row. An empty result means every ID is unique.
duplicate_id_query = "SELECT* FROM ABCData GROUP BY ID HAVING COUNT(*) > 1"
rowz = cursor.execute(duplicate_id_query).fetchall()
print(rowz)

In [None]:
# Now we want to confirm there are no duplicate rows in the data.
# Work on a copy so the original DataFrame `df` is left untouched.
df_removed = df.copy()

In [None]:
 # Drop the ID column so that rows are compared on their data fields only.
 # DataFrame.drop returns a new frame, so the result must be assigned back;
 # without the assignment df_removed would still contain ID.
 df_removed = df_removed.drop(columns=["ID"])

In [None]:
# Flag every row of df_removed that repeats an earlier row
duplicates = df_removed.duplicated()

# Report how many rows were flagged
print(f"Number of duplicate rows: {duplicates.sum()}")

# List the flagged rows, or state that there are none
if not duplicates.any():
    print("No duplicate rows found.")
else:
    print("Duplicate rows:")
    print(df_removed.loc[duplicates])


In [None]:
%load_ext nb_black
# Checking for negative sales amounts\n",
invalid_sales = df[df["Sales"] < 0]
if not invalid_sales.empty:
    raise ValueError("Invalid sales amounts found.")

In [None]:
# Checking how many rows have negative values in the Quantity or Sales column
# NOTE: the query runs through the SQL cursor, so it counts rows in the
# database table, not in the DataFrame `df`.
rowz = cursor.execute(
    "SELECT COUNT(*) AS NegativeCount FROM ABCData WHERE Quantity < 0 OR Sales < 0"
).fetchall()
print(rowz)

In [None]:
# Turn negative Quantity and Sales values positive by taking the absolute value
# of both columns at once, then print any rows whose Quantity is still negative
# (an empty frame is the expected output).
# NOTE(review): this treats negatives as sign errors — confirm they are not returns/refunds.
df[["Quantity", "Sales"]] = df[["Quantity", "Sales"]].abs()
print(df[df["Quantity"] < 0])

In [None]:
# Count rows in the database table where Quantity or Sales is exactly zero.
# NOTE(review): the alias "NegativeCount" is carried over from the negative-value
# query; here it labels a count of zero-valued rows.
rowz = cursor.execute(
    "SELECT COUNT(*) AS NegativeCount FROM ABCData WHERE Quantity = 0 OR Sales = 0"
).fetchall()
print(rowz)

In [None]:
# Keep only the rows in which both Quantity and Sales are non-zero
nonzero_mask = df["Quantity"].ne(0) & df["Sales"].ne(0)
df = df[nonzero_mask]

# Report the remaining row count
print(f"Number of rows after removal: {len(df)}")


In [None]:
import geopandas as gpd
from shapely.geometry import Point

In [None]:
# Creating a geopandas DataFrame
# gpd.points_from_xy builds the point geometries from the coordinate columns in
# one vectorised call (x = Longitude, y = Latitude) instead of a per-row loop.
# NOTE(review): no CRS is set; if the coordinates are WGS84 degrees, consider
# passing crs="EPSG:4326" to GeoDataFrame.
geometry = gpd.points_from_xy(df["Longitude"], df["Latitude"])
gdf = gpd.GeoDataFrame(df, geometry=geometry)

In [None]:
# Preview the first three rows, including the new geometry column.
gdf.head(3)

In [None]:
# Visualizing a plot of the geopandas dataframe
# Points are coloured by Country; the x axis is longitude and the y axis is
# latitude because the geometries were built as (Longitude, Latitude).
fig, ax = plt.subplots(1, figsize=(6, 12))
gdf.plot(ax=ax, column="Country", legend=True, legend_kwds={"loc": "center left"})
# Title and axis labels so the figure can be read on its own
ax.set_title("Sales Locations by Country")
ax.set_xlabel("Longitude")
ax.set_ylabel("Latitude")
# Move the legend just outside the right edge of the axes
leg = ax.get_legend()
leg.set_bbox_to_anchor((1.04, 0.5))

* From this plot, we can see that sales locations are more heavily concentrated in Germany.


In [None]:
# Scatter plot of every location, positioned by longitude/latitude and coloured
# by Sales. plotly.express is already imported as px in the notebook's import cell.
fig = px.scatter(
    gdf, x="Longitude", y="Latitude", color="Sales", hover_name="City",
    title="Scatter Plot of Locations by Latitude and Longitude",
)

# Fix the rendered figure size
fig.update_layout(
    width=800,  # width of the plot
    height=600,  # height of the plot
)
fig.show()

In [None]:
# Re-check on the DataFrame: print any rows whose Sales value is still negative.
print(df[df["Sales"] < 0])

In [None]:
# First and third quartiles of Quantity, and the interquartile range between them
Q1, Q3 = df["Quantity"].quantile([0.25, 0.75])
IQR = Q3 - Q1

# Outlier limits: 1.5 * IQR below Q1 and above Q3
lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR

# Keep the rows whose Quantity lies within the limits (both ends inclusive)
df_out = df[df["Quantity"].between(lower_bound, upper_bound)]

In [None]:
# Creating a geopandas DataFrame after removing outliers
# gpd.points_from_xy builds the point geometries from the coordinate columns in
# one vectorised call (x = Longitude, y = Latitude) instead of a per-row loop.
# NOTE(review): no CRS is set; if the coordinates are WGS84 degrees, consider
# passing crs="EPSG:4326" to GeoDataFrame.
geometry = gpd.points_from_xy(df_out["Longitude"], df_out["Latitude"])
gdf_out = gpd.GeoDataFrame(df_out, geometry=geometry)

### Hypothesis Testing and Statistical Analysis

Hypothesis 1: Higher product prices correlate with lower sales quantities

In [None]:
# Let us see how product price affects sales quantity
# Checking for missing values in the 'Price' and 'Quantity' columns
print(df[["Price", "Quantity"]].isnull().sum())

In [None]:
# Aggregate per ProductClass: total Quantity sold and mean Price
df_group = df.groupby("ProductClass", as_index=False).agg(
    Quantity=("Quantity", "sum"),
    Price=("Price", "mean"),
)

print(df_group.head())

In [None]:
# Pearson correlation between Price and Quantity across the individual rows of
# df (no aggregation). The printed label says "row-level" so this result is not
# confused with the correlation computed on per-ProductClass totals.
correlation = df["Price"].corr(df["Quantity"])
print(f"Row-level correlation between Product Price and Sales Quantity: {correlation}")

In [None]:
# Let us calculate the Pearson correlation on the per-ProductClass aggregates in
# df_group: mean Price against total Quantity.
correlation = df_group['Price'].corr(df_group['Quantity'])
print(f'Correlation between Product Price and Total Sales Quantity: {correlation}')


In [None]:
from scipy.stats import pearsonr

# Pearson's r and its p-value for Price vs Quantity on the individual rows
price_quantity_result = pearsonr(df["Price"], df["Quantity"])
pearson_corr, p_value = price_quantity_result

# Report both numbers
print(f"Pearson Correlation: {pearson_corr}")
print(f"P-value: {p_value}")

* The Pearson correlation coefficient (-0.00058) is very close to 0, indicating that there is no linear relationship between Price and Quantity in the data. The negative sign hints at an inverse relationship, but the value is so close to zero that it is negligible.
* The p-value (0.7713) is much greater than the common significance threshold of 0.05, indicating that the correlation is not statistically significant.
* This means there is no evidence of a linear relationship between Price and Sales Quantity.
* Overall, this result tells us that changes in price do not predict changes in sales quantity, and vice versa.

In [None]:
# Scatter of Price against Quantity for all rows of df, with a fitted
# linear-regression line drawn in red on an explicitly created Axes
price_fig, price_ax = plt.subplots(figsize=(10, 6))
sns.regplot(
    data=df,
    x="Price",
    y="Quantity",
    ax=price_ax,
    scatter_kws={"s": 10},
    line_kws={"color": "red"},
)
price_ax.set_title("Regression Analysis of Product Price and Sales Quantity")
price_ax.set_xlabel("Product Price")
price_ax.set_ylabel("Sales Quantity")
price_ax.grid(True)
plt.show()

* The majority of the points are concentrated near the bottom of the plot, between 0 and 20,000 sales quantity, indicating that most products have a low sales quantity regardless of their price.
* There are several data points with very high sales quantities, suggesting that some products are sold in exceptionally high volumes.
* These high quantities appear across a range of prices but do not indicate any particular pattern of correlation.
* The red regression line is nearly flat, reinforcing the earlier result of a weak or absent linear relationship between price and quantity.
* All in all, product price does not seem to significantly influence sales quantity in the data, as no strong trend is observable.

In [None]:
# Same regression plot as for the full data, but drawn from df_out, i.e. after
# the rows with Quantity outside the IQR limits were removed. The title states
# this so the two figures can be told apart.
plt.figure(figsize=(10, 6))
sns.regplot(
    x="Price",
    y="Quantity",
    data=df_out,
    scatter_kws={"s": 10},
    line_kws={"color": "red"},
)
plt.title(
    "Regression Analysis of Product Price and Sales Quantity (Quantity Outliers Removed)"
)
plt.xlabel("Product Price")
plt.ylabel("Sales Quantity")
plt.grid(True)
plt.show()

* There is no clear upward or downward trend in the plot, suggesting that changes in product price do not consistently correlate with variations in sales quantity. Other factors, such as product type, customer preference, or customer channel, might better explain changes in sales quantity.

### Hypothesis 2: Sales performance varies significantly across customer channels

In [None]:
# One-way ANOVA of "Quantity" and of "Sales" across customer channels

from scipy import stats

# Split the data once; each element is the sub-frame belonging to one Channel
channel_frames = [frame for _, frame in df.groupby("Channel")]

# ANOVA on sales quantities per channel
anova_quantity = stats.f_oneway(
    *(frame["Quantity"].values for frame in channel_frames)
)

# ANOVA on sales (revenue) per channel
anova_sales = stats.f_oneway(
    *(frame["Sales"].values for frame in channel_frames)
)

# Print results
print(f"ANOVA Result for Quantity: F-statistic = {anova_quantity.statistic}, p-value = {anova_quantity.pvalue}")
print(f"ANOVA Result for Sales: F-statistic = {anova_sales.statistic}, p-value = {anova_sales.pvalue}")


* For Quantity, the p-value (0.053) is slightly above the conventional threshold of 0.05. This means there is insufficient evidence to conclude that the mean sales quantities differ across the customer channels: the difference is not statistically significant at the 95% confidence level, although it is borderline.
* For Sales, the p-value (0.033) is below 0.05, indicating that the differences in mean sales (revenue) across customer channels are statistically significant at the 95% confidence level. This means that at least one customer channel's mean sales differ significantly from the others; the ANOVA alone does not identify which channel, so a post-hoc comparison would be needed for that.

### Hypothesis 3: Sales are higher in Urban areas compared to rural areas due to population density and healthcare infrastructure

In [None]:
# Here's a list of areas in the data
# The distinct City values shown here are what the urban/rural classification
# below is matched against.
df["City"].unique()

In [None]:
# Here's a list of the urban cities in the data set according to the
# Statistics offices of Germany and Poland.
# The names are kept as comments: as bare lines in a code cell they are not
# valid statements and the cell fails when executed.
#
# Poland:
#   Kraków
#   Warsaw
#   Gdańsk
#   Łódź
#   Wrocław
#   Poznań
#   Katowice
#   Rzeszów
#   Szczecin
#   Gdynia
#   Olsztyn
#   Radom
#   Tarnów
#   Częstochowa
#   Zabrze
#   Bytom
#   Gliwice
#   Elbląg
#   Toruń
#   Opole
#   Koszalin
#
# Germany:
#   Berlin
#   Munich (München)
#   Hamburg
#   Cologne (Köln)
#   Frankfurt
#   Stuttgart
#   Dortmund
#   Dresden
#   Leipzig
#   Bremen
#   Heidelberg
#   Osnabrück
#   Potsdam
#   Augsburg
#   Bonn
#   Lübeck
#   Karlsruhe
#   Kassel
#   Mannheim
#   Regensburg


In [None]:
# List of major urban cities in Germany and Poland
# NOTE(review): matching below is by exact string, so each name must be spelled
# exactly as it appears in the City column (e.g. "Munich" vs "München") —
# confirm against df["City"].unique().
urban_cities = [
    "Berlin", "Hamburg", "Munich", "Cologne", "Frankfurt", 
    "Stuttgart", "Dortmund", "Leipzig", 
    "Bremen", "Dresden", "Heidelberg", "Osnabrück", "Potsdam",
    "Augsburg", "Bonn", "Lübeck",
    "Karlsruhe", "Kassel", "Mannheim", "Regensburg",
    "Warsaw", "Kraków", "Łódź", "Wrocław", "Poznań", 
    "Gdańsk", "Szczecin", "Katowice", "Gdynia",
    "Częstochowa", "Radom", "Rzeszów",
    "Olsztyn", "Tarnów", "Zabrze", "Bytom", "Gliwice", 
    "Elbląg", "Toruń", "Opole", "Koszalin"
]

# Classify each city as Urban or Rural with one vectorised membership test
# instead of a per-row Python lambda: cities in the list become 'Urban',
# everything else (including missing values) becomes 'Rural'.
gdf['Urban_Rural'] = gdf['City'].isin(urban_cities).map(
    {True: 'Urban', False: 'Rural'}
)




In [None]:
# Sanity check: show which labels were actually assigned by the classification.
print(gdf['Urban_Rural'].unique())


In [None]:
# Preview the first two rows, now including the Urban_Rural column.
gdf.head(2)

In [None]:
# Save the classified GeoDataFrame to CSV.
# NOTE(review): the output path is absolute and machine-specific; the row index
# is written as well because index=False is not passed.
gdf.to_csv(r"C:\Users\user\Desktop\3Signet Project2 Task\file3.csv")

#### Determining whether sales differ significantly between urban areas (where population density is higher with better healthcare infrastructure) and rural areas

In [None]:
# T-test for a difference in Sales between urban and rural areas
from scipy.stats import ttest_ind

# Boolean masks for the two groups, then the Sales values of each
is_urban = gdf['Urban_Rural'].eq('Urban')
is_rural = gdf['Urban_Rural'].eq('Rural')
urban_sales = gdf.loc[is_urban, 'Sales']
rural_sales = gdf.loc[is_rural, 'Sales']

# equal_var=False: the two groups are not assumed to share a variance
t_stat, p_value = ttest_ind(urban_sales, rural_sales, equal_var=False)
print(f"T-Statistic: {t_stat}, P-Value: {p_value}")



In [None]:
# Interpret the t-test: a p-value below 0.05 is reported as significant
ttest_verdict = (
    "Significant difference in sales between urban and rural areas."
    if p_value < 0.05
    else "No significant difference in sales between urban and rural areas."
)
print(ttest_verdict)


In [None]:
# Box plot comparing the Sales distribution of Urban and Rural rows.
# seaborn is already imported as sns in the notebook's import cell.
box_ax = sns.boxplot(data=gdf, x="Urban_Rural", y="Sales")
box_ax.set_title("Sales Distribution: Urban vs Rural")
plt.show()


* The negative t-statistic indicates that, on average, sales in urban areas are lower than in rural areas.
* The box plot shows the same pattern.
* The p-value of 0.0034 indicates that the difference between sales in urban and rural areas is statistically significant.
* As the p-value is well below 0.05 and the observed difference runs in the opposite direction (urban sales are lower, not higher), we reject Hypothesis 3.

## INSIGHTS AND RECOMMENDATIONS

* Pricing strategies might not be directly influencing sales volume in the business, or other factors (such as demand, marketing efforts, distribution channels, etc.) could be playing a more significant role.

* The business should consider focusing on factors other than price to drive sales, such as enhancing marketing campaigns, exploring customer preferences, and improving distribution channels, especially in urban areas.

* The business should also consider other strategies to increase sales, such as bundling products, offering discounts for bulk purchases, and improving customer education on products.

* The pharmacy channel is doing significantly better than the hospital channel.

* The business should consider conducting channel-specific analysis to identify the most profitable customer segments and tailor marketing efforts accordingly.

* The business can focus on optimizing the pharmacy channel to maximize sales while exploring strategies to improve the weaker channel.

* The urban areas, despite typically being the high-potential market due to population density, are not performing as well in terms of sales volume compared to rural areas. This could be due to a variety of factors, including market saturation, high competition, or logistical challenges.

* The business can reevaluate its pricing strategy and marketing efforts in urban areas. Urban areas may be oversaturated with competition, so targeting different value propositions or adjusting pricing strategies could help.

* The business should consider targeted promotions, loyalty programs, and marketing campaigns to boost sales in urban areas.
