In [1]:
import pandas as pd
from datetime import datetime

In [2]:
# Load the sales table from disk and preview its first two rows.
sales_path = 'sales.csv'
df = pd.read_csv(sales_path)
df.head(n=2)

Unnamed: 0,SaleID,Region,Amount,Date
0,1,North,500,2024-01-01
1,2,South,300,2024-02-15


In [4]:
# Per-region aggregates: number of transactions (SaleID count) and total Amount.
# The groupby object is built once and reused for both aggregations.
sales_by_region = df.groupby('Region')
tr_count_region = sales_by_region['SaleID'].count()
tr_sales_region = sales_by_region['Amount'].sum()

print(f"Transactions count by Region:\n{tr_count_region}\nTransactions sales by Region:\n{tr_sales_region}")

Transacstions count by Region:
Region
North    2
South    2
West     1
Name: SaleID, dtype: int64
Transacstions sales by Region:
Region
North    1200
South    1300
West      400
Name: Amount, dtype: int64


In [19]:
# Rows of the per-region totals whose Amount equals the maximum (ties are all kept).
tr_sales_region_df = tr_sales_region.reset_index()
top_sales_region = tr_sales_region_df[tr_sales_region_df['Amount'] == tr_sales_region_df['Amount'].max()]
# Join the region names so the message shows just the name(s) rather than the
# Series repr (index label, "Name:", "dtype:").
print(f"Region with top sales: {', '.join(top_sales_region['Region'].astype(str))}")


Region with top sales: 1    South
Name: Region, dtype: object


In [21]:
# Rows of the per-region transaction counts whose count equals the minimum (ties are all kept).
tr_count_region_df = tr_count_region.reset_index()
lowest_tr_count_region = tr_count_region_df[tr_count_region_df['SaleID'] == tr_count_region_df['SaleID'].min()]
# Join the region names so the message shows just the name(s) rather than the
# Series repr (index label, "Name:", "dtype:").
print(f"Region with lowest sales count: {', '.join(lowest_tr_count_region['Region'].astype(str))}")

Region with lowest sales count: 2    West
Name: Region, dtype: object


In [27]:
# Share of the overall sales amount contributed by each region, in percent.
total_sales = df['Amount'].sum()
pct_by_region = tr_sales_region.div(total_sales).mul(100)

# Two-column table: Region and its percentage share.
region_df = pct_by_region.rename('Pct_Region').reset_index()
region_df



Unnamed: 0,Region,Pct_Region
0,North,41.37931
1,South,44.827586
2,West,13.793103


In [40]:
# Attach each sale's regional share of total sales (percent, rounded to 2 decimals).
# No visible cell creates Pct_Region_x / Pct_Region_y or Pct_Region, so the column is
# built here directly from pct_by_region; assigning one named column keeps the cell
# safe to re-run. Any stale *_x / *_y leftovers are removed if present and ignored otherwise.
df = df.drop(columns=['Pct_Region_y', 'Pct_Region_x'], errors='ignore')
df['Pct_Region'] = df['Region'].map(pct_by_region).round(2)
df

Unnamed: 0,SaleID,Region,Amount,Date,Pct_Region
0,1,North,500,2024-01-01,41.38
1,2,South,300,2024-02-15,44.83
2,3,North,700,2024-03-10,41.38
3,4,West,400,2024-01-20,13.79
4,5,South,1000,2024-03-25,44.83


In [42]:
# 4. Filtering data

# Sales with an Amount strictly greater than 700.
amount_above_700 = df['Amount'].gt(700)
tr_amount_h_700 = df.loc[amount_above_700]
print(tr_amount_h_700)

# Sales recorded for the North region.
is_north = df['Region'].eq("North")
tr_north_region = df.loc[is_north]
print(tr_north_region)

   SaleID Region  Amount        Date  Pct_Region
4       5  South    1000  2024-03-25       44.83
   SaleID Region  Amount        Date  Pct_Region
0       1  North     500  2024-01-01       41.38
2       3  North     700  2024-03-10       41.38


In [43]:
# Convert the Date column to datetime so the .dt accessor and date comparisons work.
df['Date'] = df['Date'].pipe(pd.to_datetime)

In [48]:
# Re-read the sales file into a separate frame and convert its Date column to datetime.
may_df = pd.read_csv('sales.csv').assign(
    Date=lambda frame: pd.to_datetime(frame['Date'])
)

In [50]:
# Sales dated in May 2024.
in_year_2024 = may_df['Date'].dt.year == 2024
in_may = may_df['Date'].dt.month == 5
tr_may_2024 = may_df[in_may & in_year_2024]
tr_may_2024

Unnamed: 0,SaleID,Region,Amount,Date
5,6,South,1000,2024-05-25


In [51]:
# 3. Creating new columns
def get_discount(amount):
    """Return ``amount`` multiplied by 0.9 when it exceeds 500, otherwise unchanged."""
    return amount * 0.9 if amount > 500 else amount
    
def categorize(amount):
    """Label an amount as "Low", "Medium" or "High".

    "Low" is below 300, "Medium" is 300 through 700 inclusive, and "High" is
    above 700. Returns None when none of the comparisons holds (e.g. NaN).
    """
    if amount < 300:
        return "Low"
    if 300 <= amount <= 700:
        return "Medium"
    if amount > 700:
        return "High"
    return None

# Derive both new columns element-wise from Amount.
amounts = df['Amount']
df['DiscountedAmount'] = amounts.map(get_discount)
df['Category'] = amounts.map(categorize)


In [52]:
# Display the frame, which now includes the DiscountedAmount and Category columns.
df

Unnamed: 0,SaleID,Region,Amount,Date,Pct_Region,DiscountedAmount,Category
0,1,North,500,2024-01-01,41.38,500.0,Medium
1,2,South,300,2024-02-15,44.83,300.0,Medium
2,3,North,700,2024-03-10,41.38,630.0,Medium
3,4,West,400,2024-01-20,13.79,400.0,Medium
4,5,South,1000,2024-03-25,44.83,900.0,High


In [None]:
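# Alternative to categorize() using pd.cut (kept for reference, not executed).
# NOTE: with pd.cut's default right=True the bins are right-closed, so Amount == 300
# would be labelled 'Low' here, whereas categorize() returns 'Medium' for 300.
# df['Category'] = pd.cut(
#     df['Amount'],
#     bins=[-float('inf'), 300, 700, float('inf')],  # Bin edges
#     labels=['Low', 'Medium', 'High']              # Labels
# )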


In [59]:
# 2. Working with time
# Using different dates (2024-02-01 and 2024-04-01) to align with my dataset.
start_date = datetime(2024, 2, 1)
end_date = datetime(2024, 4, 1)

# Keep sales dated within [start_date, end_date] (both ends inclusive), oldest first.
in_window = df['Date'].between(start_date, end_date)
tr_dates = df[in_window].sort_values(by='Date')
tr_dates

Unnamed: 0,SaleID,Region,Amount,Date,Pct_Region,DiscountedAmount,Category,Weekday
1,2,South,300,2024-02-15,44.83,300.0,Medium,Thursday
2,3,North,700,2024-03-10,41.38,630.0,Medium,Sunday
4,5,South,1000,2024-03-25,44.83,900.0,High,Monday


In [60]:
# Span between the earliest and latest sale date in the filtered window;
# the message reports only the whole-day component of that span.
window_dates = tr_dates['Date']
earliest, latest = window_dates.min(), window_dates.max()
dates_diff = latest - earliest
print(f"Różnica między datami wynosi: {dates_diff.days} dni.")


Różnica między datami wynosi: 39 dni.


In [58]:
# Add the English weekday name of each sale date as a new column, then show the frame.
weekday_names = df['Date'].dt.day_name()
df['Weekday'] = weekday_names
df

Unnamed: 0,SaleID,Region,Amount,Date,Pct_Region,DiscountedAmount,Category,Weekday
0,1,North,500,2024-01-01,41.38,500.0,Medium,Monday
1,2,South,300,2024-02-15,44.83,300.0,Medium,Thursday
2,3,North,700,2024-03-10,41.38,630.0,Medium,Sunday
3,4,West,400,2024-01-20,13.79,400.0,Medium,Saturday
4,5,South,1000,2024-03-25,44.83,900.0,High,Monday
