

Read and concatenate CSVs

In [None]:
import pandas as pd
import glob
import psycopg2

# Folder holding the raw trip CSVs, relative to the notebook's working directory.
csv_location = "bike-rental-starter-kit/data"

# glob returns paths in arbitrary, filesystem-dependent order. Sort them so the
# concatenated row order (and the index-based id column derived from it later)
# is the same on every run and every machine.
trip_data_files = sorted(glob.glob(rf"{csv_location}/*tripdata.csv"))
if not trip_data_files:
    # Without this guard pd.concat fails with an opaque "No objects to concatenate".
    raise FileNotFoundError(
        f"No '*tripdata.csv' files found in {csv_location!r}; check the data path."
    )

# One DataFrame per file, stacked into a single frame with a fresh 0..n-1 index.
trip_data_df = pd.concat([pd.read_csv(f) for f in trip_data_files], ignore_index=True)


Replacing whitespace in the column names with "_" and converting them to lowercase

In [66]:
# Normalise the headers to snake_case: lowercase them and swap spaces for underscores.
trip_data_df.columns = [name.lower().replace(" ", "_") for name in trip_data_df.columns]


Inspecting first few rows of our new dataframe

In [67]:
# Preview the first rows to confirm the load and the renamed columns.
trip_data_df.head()

Unnamed: 0,trip_duration,start_time,stop_time,start_station_id,start_station_name,start_station_latitude,start_station_longitude,end_station_id,end_station_name,end_station_latitude,end_station_longitude,bike_id,user_type,birth_year,gender
0,362,2016-01-01 00:02:52,2016-01-01 00:08:54,3186,Grove St PATH,40.719586,-74.043117,3209,Brunswick St,40.724176,-74.050656,24647,Subscriber,1964.0,2
1,200,2016-01-01 00:18:22,2016-01-01 00:21:42,3186,Grove St PATH,40.719586,-74.043117,3213,Van Vorst Park,40.718489,-74.047727,24605,Subscriber,1962.0,1
2,202,2016-01-01 00:18:25,2016-01-01 00:21:47,3186,Grove St PATH,40.719586,-74.043117,3213,Van Vorst Park,40.718489,-74.047727,24689,Subscriber,1962.0,2
3,248,2016-01-01 00:23:13,2016-01-01 00:27:21,3209,Brunswick St,40.724176,-74.050656,3203,Hamilton Park,40.727596,-74.044247,24693,Subscriber,1984.0,1
4,903,2016-01-01 01:03:20,2016-01-01 01:18:24,3195,Sip Ave,40.730743,-74.063784,3210,Pershing Field,40.742677,-74.051789,24573,Customer,,0


Inspecting the default datatypes given by Pandas

In [68]:
# Show the dtype pandas inferred for each column when reading the CSVs.
trip_data_df.dtypes

trip_duration                int64
start_time                  object
stop_time                   object
start_station_id             int64
start_station_name          object
start_station_latitude     float64
start_station_longitude    float64
end_station_id               int64
end_station_name            object
end_station_latitude       float64
end_station_longitude      float64
bike_id                      int64
user_type                   object
birth_year                 float64
gender                       int64
dtype: object

Changing to other better suiting data types

In [69]:
# Columns grouped by the dtype they should end up with.
# "Int64" (capital I) is pandas' nullable integer, so birth_year can keep its missing values.
nullable_int_columns = ["trip_duration", "start_station_id", "end_station_id", "bike_id", "birth_year"]
string_columns = ["start_station_name", "end_station_name"]
category_columns = ["user_type", "gender"]

# Column -> dtype mapping handed to astype in one call.
convert_dict = {
    **dict.fromkeys(nullable_int_columns, "Int64"),
    **dict.fromkeys(string_columns, "string"),
    **dict.fromkeys(category_columns, "category"),
}
trip_data_df = trip_data_df.astype(convert_dict)

Also, we convert the date columns separately with pd.to_datetime, which parses the strings and, with errors="coerce", turns any unparseable value into NaT instead of raising an error.

In [70]:
# Parse both timestamp columns; errors="coerce" yields NaT for unparseable values.
for time_column in ("start_time", "stop_time"):
    trip_data_df[time_column] = pd.to_datetime(trip_data_df[time_column], errors="coerce")

trip_data_df.dtypes

trip_duration                       Int64
start_time                 datetime64[ns]
stop_time                  datetime64[ns]
start_station_id                    Int64
start_station_name         string[python]
start_station_latitude            float64
start_station_longitude           float64
end_station_id                      Int64
end_station_name           string[python]
end_station_latitude              float64
end_station_longitude             float64
bike_id                             Int64
user_type                        category
birth_year                          Int64
gender                           category
dtype: object

Next, we check whether there are any duplicate rows and, if so, remove them.

In [71]:
# Count rows that are exact duplicates of an earlier row.
trip_data_df.duplicated().sum()


np.int64(0)

In [72]:
# Remove exact duplicate rows in place. The check in the previous cell found none,
# so on this data the call is a safeguard rather than an actual change.
trip_data_df.drop_duplicates(inplace=True)

We are going to define an ID column using the indexes, so it will be easier to manage our data in the future.

In [73]:
# Copy the current row index into an explicit "id" column, then preview the result.
trip_data_df["id"] = trip_data_df.index
trip_data_df.head()

Unnamed: 0,trip_duration,start_time,stop_time,start_station_id,start_station_name,start_station_latitude,start_station_longitude,end_station_id,end_station_name,end_station_latitude,end_station_longitude,bike_id,user_type,birth_year,gender,id
0,362,2016-01-01 00:02:52,2016-01-01 00:08:54,3186,Grove St PATH,40.719586,-74.043117,3209,Brunswick St,40.724176,-74.050656,24647,Subscriber,1964.0,2,0
1,200,2016-01-01 00:18:22,2016-01-01 00:21:42,3186,Grove St PATH,40.719586,-74.043117,3213,Van Vorst Park,40.718489,-74.047727,24605,Subscriber,1962.0,1,1
2,202,2016-01-01 00:18:25,2016-01-01 00:21:47,3186,Grove St PATH,40.719586,-74.043117,3213,Van Vorst Park,40.718489,-74.047727,24689,Subscriber,1962.0,2,2
3,248,2016-01-01 00:23:13,2016-01-01 00:27:21,3209,Brunswick St,40.724176,-74.050656,3203,Hamilton Park,40.727596,-74.044247,24693,Subscriber,1984.0,1,3
4,903,2016-01-01 01:03:20,2016-01-01 01:18:24,3195,Sip Ave,40.730743,-74.063784,3210,Pershing Field,40.742677,-74.051789,24573,Customer,,0,4


Now, let's move forward with further analysis of our DataFrame

In [74]:
# Summary statistics (count, mean, quartiles, min/max, std) for the numeric and datetime columns.
trip_data_df.describe()

Unnamed: 0,trip_duration,start_time,stop_time,start_station_id,start_station_latitude,start_station_longitude,end_station_id,end_station_latitude,end_station_longitude,bike_id,birth_year,id
count,247584.0,247584,247584,247584.0,247584.0,247584.0,247584.0,247584.0,247584.0,247584.0,228585.0,247584.0
mean,885.630533,2016-07-29 05:55:07.541335040,2016-07-29 06:09:53.671073536,3207.065206,40.723121,-74.046438,3203.572553,40.722594,-74.045855,24935.260481,1979.335276,123791.5
min,61.0,2016-01-01 00:02:52,2016-01-01 00:08:54,3183.0,40.69264,-74.096937,147.0,40.692216,-74.096937,14552.0,1900.0,0.0
25%,248.0,2016-05-27 07:46:06,2016-05-27 07:54:40.249999872,3186.0,40.717732,-74.050656,3186.0,40.71654,-74.050444,24491.0,1974.0,61895.75
50%,390.0,2016-08-10 09:23:50,2016-08-10 09:34:32.500000,3201.0,40.721525,-74.044247,3199.0,40.721124,-74.043117,24609.0,1981.0,123791.5
75%,666.0,2016-10-05 17:25:05.500000,2016-10-05 17:33:00.750000128,3211.0,40.727596,-74.038051,3211.0,40.727224,-74.036486,24719.0,1986.0,185687.25
max,16329808.0,2016-12-31 23:44:50,2017-01-18 14:26:46,3426.0,40.752559,-74.032108,3426.0,40.801343,-73.95739,27274.0,2000.0,247583.0
std,35937.976494,,,26.955103,0.008199,0.011211,61.579494,0.007958,0.011283,748.469712,9.596809,71471.488861


In [75]:
# Per-column dtype and non-null count; memory_usage='deep' requests the deep memory measurement.
trip_data_df.info(memory_usage='deep')

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 247584 entries, 0 to 247583
Data columns (total 16 columns):
 #   Column                   Non-Null Count   Dtype         
---  ------                   --------------   -----         
 0   trip_duration            247584 non-null  Int64         
 1   start_time               247584 non-null  datetime64[ns]
 2   stop_time                247584 non-null  datetime64[ns]
 3   start_station_id         247584 non-null  Int64         
 4   start_station_name       247584 non-null  string        
 5   start_station_latitude   247584 non-null  float64       
 6   start_station_longitude  247584 non-null  float64       
 7   end_station_id           247584 non-null  Int64         
 8   end_station_name         247584 non-null  string        
 9   end_station_latitude     247584 non-null  float64       
 10  end_station_longitude    247584 non-null  float64       
 11  bike_id                  247584 non-null  Int64         
 12  user_type       

Missing Data:

In [76]:
# Number of missing values in each column.
trip_data_df.isna().sum()

trip_duration                  0
start_time                     0
stop_time                      0
start_station_id               0
start_station_name             0
start_station_latitude         0
start_station_longitude        0
end_station_id                 0
end_station_name               0
end_station_latitude           0
end_station_longitude          0
bike_id                        0
user_type                    380
birth_year                 18999
gender                         0
id                             0
dtype: int64

The trip_duration column has a suspiciously large maximum and also a very small minimum value.

In [77]:
# trip_duration is in seconds; derive minute and hour versions rounded to two decimals
# so the extreme values are easier to judge.
duration_seconds = trip_data_df["trip_duration"]
trip_data_df["trip_minutes"] = (duration_seconds / 60).round(2)
trip_data_df["trip_hours"] = (duration_seconds / 3600).round(2)

trip_data_df[["trip_duration", "trip_minutes", "trip_hours"]].describe()

Unnamed: 0,trip_duration,trip_minutes,trip_hours
count,247584.0,247584.0,247584.0
mean,885.630533,14.760503,0.246006
std,35937.976494,598.966283,9.982777
min,61.0,1.02,0.02
25%,248.0,4.13,0.07
50%,390.0,6.5,0.11
75%,666.0,11.1,0.18
max,16329808.0,272163.47,4536.06


According to the Citi Bike data dictionary, any trips shorter than 60 seconds are likely false starts or users quickly re-docking the bike to ensure it’s secure.

Looking at the data, some trips are extremely long, e.g., 4,500 hours, which is roughly half a year. It’s plausible that an annual subscriber could have left a bike out for an extended period without returning it. Therefore, we will keep these unusually long trips for analysis.

However, trips under 60 seconds will be removed, in line with the official documentation.

In [78]:
# Index labels of trips lasting less than 60 seconds; these rows are removed in place.
short_trip_rows = trip_data_df[trip_data_df["trip_duration"] < 60].index
trip_data_df.drop(index=short_trip_rows, inplace=True)

# Re-check the duration statistics after the filter.
trip_data_df[["trip_duration", "trip_minutes", "trip_hours"]].describe()

Unnamed: 0,trip_duration,trip_minutes,trip_hours
count,247584.0,247584.0,247584.0
mean,885.630533,14.760503,0.246006
std,35937.976494,598.966283,9.982777
min,61.0,1.02,0.02
25%,248.0,4.13,0.07
50%,390.0,6.5,0.11
75%,666.0,11.1,0.18
max,16329808.0,272163.47,4536.06


After analyzing the data further, we can see that the earliest birth year is 1900. This is quite odd because, considering this dataset was created in 2016, it would mean the cyclist would be over 100 years old. Let's fix this. We are going to drop every row where our cyclist is more than 80 years old.

In [79]:
# Riders born before 1936 would be over 80 in 2016; drop those rows in place.
# Rows with a missing birth_year are not matched by the comparison and are kept.
too_old_rows = trip_data_df[trip_data_df["birth_year"] < 1936].index
trip_data_df.drop(index=too_old_rows, inplace=True)

# Five earliest remaining birth years.
trip_data_df["birth_year"].sort_values(ascending=True).head(5)

164274    1937
187190    1937
213567    1937
180373    1937
112336    1940
Name: birth_year, dtype: Int64

On the other hand, 16-year-olds can easily use a bicycle.

In [80]:
# Five most recent birth years, i.e. the youngest riders.
trip_data_df["birth_year"].sort_values(ascending=False).head(5)

101903    2000
101863    2000
101921    2000
94636     2000
94625     2000
Name: birth_year, dtype: Int64

We can also see that there are many missing values in the user_type and birth_year columns

In [81]:
# All trips whose birth_year is missing.
birth_year_is_missing = trip_data_df["birth_year"].isna()
missing_birth_year = trip_data_df[birth_year_is_missing]
missing_birth_year.head()

Unnamed: 0,trip_duration,start_time,stop_time,start_station_id,start_station_name,start_station_latitude,start_station_longitude,end_station_id,end_station_name,end_station_latitude,end_station_longitude,bike_id,user_type,birth_year,gender,id,trip_minutes,trip_hours
4,903,2016-01-01 01:03:20,2016-01-01 01:18:24,3195,Sip Ave,40.730743,-74.063784,3210,Pershing Field,40.742677,-74.051789,24573,Customer,,0,4,15.05,0.25
5,883,2016-01-01 01:03:28,2016-01-01 01:18:11,3195,Sip Ave,40.730743,-74.063784,3210,Pershing Field,40.742677,-74.051789,24442,Customer,,0,5,14.72,0.25
22,988,2016-01-01 03:16:33,2016-01-01 03:33:02,3196,Riverview Park,40.744319,-74.043991,3209,Brunswick St,40.724176,-74.050656,24662,Customer,,0,22,16.47,0.27
53,3090,2016-01-01 11:07:15,2016-01-01 11:58:46,3203,Hamilton Park,40.727596,-74.044247,3203,Hamilton Park,40.727596,-74.044247,24444,Customer,,0,53,51.5,0.86
57,788,2016-01-01 11:50:30,2016-01-01 12:03:39,3210,Pershing Field,40.742677,-74.051789,3195,Sip Ave,40.730743,-74.063784,24573,Customer,,0,57,13.13,0.22


In [82]:
# How the trips without a birth year split across user types.
missing_birth_year["user_type"].value_counts()



user_type
Customer      15470
Subscriber     3529
Name: count, dtype: int64

In [83]:
# How the trips without a birth year split across gender codes.
missing_birth_year['gender'].value_counts()

gender
0    18999
1        0
2        0
Name: count, dtype: int64

In [84]:
# Share of each user type's trips that lack a birth year.
missing_per_type = missing_birth_year["user_type"].value_counts()
trips_per_type = trip_data_df["user_type"].value_counts()
missing_per_type / trips_per_type

user_type
Customer      0.996714
Subscriber    0.015232
Name: count, dtype: float64

We can see that 99% of the customers don’t have a birth year, and their gender is also unknown. Therefore, it is reasonable to assume that people with missing birth year and unknown gender should be assigned the "Customer" role, while the others can be assumed to be "Subscribers." This is because a customer can use a bike for a maximum of three days, whereas a subscriber can use it for a year. It is more likely that annual subscribers have long-term plans and will provide more information about themselves.

In [85]:
# Step 1: rows without a user_type that look anonymous (gender code 0 or no birth year)
# are labelled "Customer".
user_type_missing = trip_data_df["user_type"].isna()
looks_anonymous = (trip_data_df["gender"] == 0) | trip_data_df["birth_year"].isna()
cond_customer = user_type_missing & looks_anonymous
trip_data_df.loc[cond_customer, "user_type"] = "Customer"

# Step 2: rows still without a user_type after step 1, with a known birth year and a
# non-zero gender code, are labelled "Subscriber". The missing mask is re-evaluated
# here on purpose so that it reflects the assignment above.
cond_subscriber = (
    trip_data_df["user_type"].isna()
    & trip_data_df["birth_year"].notna()
    & (trip_data_df["gender"] != 0)
)
trip_data_df.loc[cond_subscriber, "user_type"] = "Subscriber"

# Confirm that user_type has no missing values left.
trip_data_df.isna().sum()

trip_duration                  0
start_time                     0
stop_time                      0
start_station_id               0
start_station_name             0
start_station_latitude         0
start_station_longitude        0
end_station_id                 0
end_station_name               0
end_station_latitude           0
end_station_longitude          0
bike_id                        0
user_type                      0
birth_year                 18999
gender                         0
id                             0
trip_minutes                   0
trip_hours                     0
dtype: int64

Now we only have to deal with the missing birth_year values. There are three possible approaches: we could drop the rows with missing values, fill them with a value such as the average birth year of the DataFrame, or leave them as they are.

In [86]:
# Fraction of rows whose birth_year is missing (the mean of a boolean mask is its share of True).
trip_data_df["birth_year"].isna().mean()

np.float64(0.0767382119863318)

Only about 8% of the birth_year values are missing. We can replace the missing values with an average estimate so that our dataset has no missing values.

In [87]:
# Fill missing birth years with the mean of the known ones, truncated to a whole year.
known_birth_years = trip_data_df["birth_year"]
avg_birth_year = int(known_birth_years.mean())
trip_data_df["birth_year"] = known_birth_years.fillna(avg_birth_year)

# No column should report missing values any more.
trip_data_df.isna().sum()

trip_duration              0
start_time                 0
stop_time                  0
start_station_id           0
start_station_name         0
start_station_latitude     0
start_station_longitude    0
end_station_id             0
end_station_name           0
end_station_latitude       0
end_station_longitude      0
bike_id                    0
user_type                  0
birth_year                 0
gender                     0
id                         0
trip_minutes               0
trip_hours                 0
dtype: int64

Let's look at gender:

In [88]:
# Trips per (user_type, gender) combination.
# Both grouping keys are categorical, so `observed` is passed explicitly: relying on the
# default is deprecated in pandas and emits a FutureWarning (the stray echoed source line
# in this cell's saved output appears to be the tail of that warning). observed=False
# keeps the current result, listing every category combination.
# Selecting "id" before counting avoids counting every other column only to discard it.
trip_data_df.groupby(["user_type", "gender"], observed=False)["id"].count()

  trip_data_df.groupby(['user_type','gender']).count()['id']


user_type   gender
Customer    0          15470
            1             36
            2             15
Subscriber  0           4431
            1         177160
            2          50470
Name: id, dtype: int64

There were only a few rows that we assigned to the Customer user type. This reflects the fact that many customers did not provide complete information. As a result, the Customer data may not be as reliable as the Subscriber data, and we should keep this in mind during analysis.

Since we dropped a few rows, let's reset the index.

In [89]:
# Rows were dropped during cleaning, so rebuild a gap-free 0..n-1 index, discard the
# stale "id" column that was taken from the old index, and store the new index as "ID".
trip_data_df.reset_index(drop=True, inplace=True)
trip_data_df.drop(columns="id", inplace=True)
trip_data_df["ID"] = trip_data_df.index
trip_data_df.head()

Unnamed: 0,trip_duration,start_time,stop_time,start_station_id,start_station_name,start_station_latitude,start_station_longitude,end_station_id,end_station_name,end_station_latitude,end_station_longitude,bike_id,user_type,birth_year,gender,trip_minutes,trip_hours,ID
0,362,2016-01-01 00:02:52,2016-01-01 00:08:54,3186,Grove St PATH,40.719586,-74.043117,3209,Brunswick St,40.724176,-74.050656,24647,Subscriber,1964,2,6.03,0.1,0
1,200,2016-01-01 00:18:22,2016-01-01 00:21:42,3186,Grove St PATH,40.719586,-74.043117,3213,Van Vorst Park,40.718489,-74.047727,24605,Subscriber,1962,1,3.33,0.06,1
2,202,2016-01-01 00:18:25,2016-01-01 00:21:47,3186,Grove St PATH,40.719586,-74.043117,3213,Van Vorst Park,40.718489,-74.047727,24689,Subscriber,1962,2,3.37,0.06,2
3,248,2016-01-01 00:23:13,2016-01-01 00:27:21,3209,Brunswick St,40.724176,-74.050656,3203,Hamilton Park,40.727596,-74.044247,24693,Subscriber,1984,1,4.13,0.07,3
4,903,2016-01-01 01:03:20,2016-01-01 01:18:24,3195,Sip Ave,40.730743,-74.063784,3210,Pershing Field,40.742677,-74.051789,24573,Customer,1979,0,15.05,0.25,4


Now that we have finished cleaning the data, we will connect to a PostgreSQL server and upload our DataFrame into relational tables.
Firstly we create the tables:

In [None]:
import os
from urllib.parse import quote

# Connection setup with local PG.
# NOTE(review): no cell visible in this notebook reads `config`; every connection below
# goes through `conn_string`. It is kept for anyone who wants to target a local server.
config = {"host": "localhost",
          "port": 5432,
          "database": "postgres",
          "user": "postgres",
          "password": "postgres"}

# Read the Supabase password from the environment so it is never stored in the notebook.
password = os.getenv("PG_PASSWORD")
if not password:
    # os.getenv returns None when the variable is unset; interpolating that would build
    # a URI with the literal password "None" and surface later as a confusing auth error.
    raise RuntimeError(
        "PG_PASSWORD environment variable is not set; "
        "export it before running the database cells."
    )

# Connection setup with supabase.
# The password is percent-encoded because characters such as '@', ':' or '/' would
# otherwise be parsed as URI delimiters. PG_PASSWORD must therefore hold the raw,
# un-encoded password.
conn_string = (
    f"postgresql://postgres.juqzmtaoicczafyaspmt:{quote(password, safe='')}"
    "@aws-1-eu-central-1.pooler.supabase.com:6543/postgres"
)

In [91]:


# Create the bikeshare schema and its tables (idempotent thanks to IF NOT EXISTS).
#
# psycopg2's `with conn:` block only scopes a transaction (commit on success, rollback
# on error); it does NOT close the connection. The connection is therefore closed
# explicitly in `finally` so pooled server connections are not leaked on every run.
conn = psycopg2.connect(conn_string)
try:
    # The try wraps the transaction block, so an error rolls the transaction back
    # before it is reported.
    with conn:
        with conn.cursor() as cur:
            cur.execute("""
                CREATE SCHEMA IF NOT EXISTS bikeshare;

                CREATE TABLE IF NOT EXISTS bikeshare.stations (
                    id INT PRIMARY KEY,
                    name VARCHAR(100),
                    latitude DECIMAL,
                    longitude DECIMAL
                );

                CREATE TABLE IF NOT EXISTS bikeshare.gender (
                    id INT PRIMARY KEY,
                    gender_name VARCHAR(10)
                );

                CREATE TABLE IF NOT EXISTS bikeshare.users (
                    id INT PRIMARY KEY,
                    user_type VARCHAR(30),
                    gender INT REFERENCES bikeshare.gender(id),
                    birth_year INT
                );

                CREATE TABLE IF NOT EXISTS bikeshare.dim_date (
                    date_id INT PRIMARY KEY,
                    full_date DATE NOT NULL,
                    year INT,
                    month INT,
                    day INT,
                    month_name VARCHAR(20),
                    day_name VARCHAR(20),
                    quarter INT
                );

                CREATE TABLE IF NOT EXISTS bikeshare.trip_informations (
                    id SERIAL PRIMARY KEY,
                    trip_duration INT,
                    start_time TIMESTAMP,
                    stop_time TIMESTAMP,
                    start_station_id INT REFERENCES bikeshare.stations(id),
                    end_station_id INT REFERENCES bikeshare.stations(id),
                    bike_id INT,
                    user_id INT REFERENCES bikeshare.users(id),
                    date_key INT REFERENCES bikeshare.dim_date(date_id)
                );
            """)
    # Reported only after the transaction block has exited, i.e. after the commit.
    print("Tables created successfully.")
except psycopg2.DatabaseError as e:
    print("Database error:", e)
finally:
    conn.close()

Tables created successfully.


Now we are going to define a new dataset to build the Date Dimension table for our database. This is very useful because it standardizes all date attributes across the data warehouse, making it easier to perform consistent time-based analysis.

In [92]:
import datetime

# One row per calendar day, starting on 2016-01-01 and covering 366 consecutive days.
beginning = datetime.datetime(2016, 1, 1)
col_date = []
for day_offset in range(366):
    col_date.append(beginning + datetime.timedelta(days=day_offset))

date_df = pd.DataFrame({"full_date": col_date})
date_df["full_date"] = pd.to_datetime(date_df["full_date"], errors="coerce")
date_df.head()

Unnamed: 0,full_date
0,2016-01-01
1,2016-01-02
2,2016-01-03
3,2016-01-04
4,2016-01-05


Now we are going to generate the year, month, day, month name, day name and the quarter of the year:

In [93]:
# Derive the calendar attributes of the date dimension from full_date.
full_date_parts = date_df["full_date"].dt
date_df["year"] = full_date_parts.year
date_df["month"] = full_date_parts.month
date_df["day"] = full_date_parts.day
date_df["month_name"] = full_date_parts.strftime("%B")
date_df["day_name"] = full_date_parts.strftime("%A")
date_df["quarter"] = full_date_parts.quarter

date_df.dtypes

full_date     datetime64[ns]
year                   int32
month                  int32
day                    int32
month_name            object
day_name              object
quarter                int32
dtype: object

Next, we are going to create the two keys that will join these two future tables together in the dataset:

In [94]:

# Both keys encode a calendar day as the integer YYYYMMDD, so a trip's date_key
# (taken from its start_time) matches the date_id of the corresponding dim_date row.
date_as_text = date_df["full_date"].dt.strftime("%Y%m%d")
date_df["date_id"] = date_as_text.astype("int64")

start_day_as_text = trip_data_df["start_time"].dt.strftime("%Y%m%d")
trip_data_df["date_key"] = start_day_as_text.astype("int64")

# Put date_id first so the column order matches the dim_date table definition.
dim_date_column_order = ["date_id", "full_date", "year", "month", "day", "month_name", "day_name", "quarter"]
date_df = date_df[dim_date_column_order]
date_df.tail(10)

Unnamed: 0,date_id,full_date,year,month,day,month_name,day_name,quarter
356,20161222,2016-12-22,2016,12,22,December,Thursday,4
357,20161223,2016-12-23,2016,12,23,December,Friday,4
358,20161224,2016-12-24,2016,12,24,December,Saturday,4
359,20161225,2016-12-25,2016,12,25,December,Sunday,4
360,20161226,2016-12-26,2016,12,26,December,Monday,4
361,20161227,2016-12-27,2016,12,27,December,Tuesday,4
362,20161228,2016-12-28,2016,12,28,December,Wednesday,4
363,20161229,2016-12-29,2016,12,29,December,Thursday,4
364,20161230,2016-12-30,2016,12,30,December,Friday,4
365,20161231,2016-12-31,2016,12,31,December,Saturday,4


We will also create a separate dataframe for the stations, so that we can upload them into the stations table in our SQL database.

In [95]:
# Build one row per station from the start-side and end-side columns of the trips.
station_columns = ["station_id", "station_name", "latitude", "longitude"]

start_stations_df = (
    trip_data_df[["start_station_id", "start_station_name", "start_station_latitude", "start_station_longitude"]]
    .drop_duplicates()
    .reset_index(drop=True)
)
end_stations_df = (
    trip_data_df[["end_station_id", "end_station_name", "end_station_latitude", "end_station_longitude"]]
    .drop_duplicates()
    .reset_index(drop=True)
)
start_stations_df.columns = station_columns
end_stations_df.columns = station_columns

# De-duplicate on station_id only. station_id becomes the primary key of
# bikeshare.stations, and de-duplicating on all four columns would keep two rows for a
# station whose name or coordinates differ between trips, making the insert fail.
# The first occurrence of each station_id is kept.
stations_df = (
    pd.concat([start_stations_df, end_stations_df])
    .drop_duplicates(subset="station_id")
    .reset_index(drop=True)
)
stations_df.tail()


Unnamed: 0,station_id,station_name,latitude,longitude
97,2004,6 Ave & Broome St,40.724399,-74.004704
98,393,E 5 St & Avenue C,40.722992,-73.979955
99,401,Allen St & Rivington St,40.720196,-73.989978
100,376,John St & William St,40.708621,-74.007222
101,224,Spruce St & Nassau St,40.711464,-74.005524


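The dataset contains no rider identifier, so we build a surrogate one: we extracted the "birth_year", "user_type", and "gender" columns into a separate dataframe, dropped any duplicate rows, added a user_id column, and then merged it back with the original dataframe. This way, every trip carries a user_id for analysis. Note that each user_id identifies a unique combination of birth year, user type, and gender (151 combinations in total), not an individual rider.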


In [96]:
# Columns whose distinct combinations define a "user" in this model.
user_key_columns = ["birth_year", "user_type", "gender"]

# One row per distinct combination, numbered from 1.
users_df = (
    trip_data_df[user_key_columns]
    .drop_duplicates()
    .reset_index(drop=True)
)
users_df["user_id"] = range(1, len(users_df) + 1)

# Attach user_id to every trip. The guard makes re-running the cell safe: without it a
# second merge would add suffixed duplicate user_id columns.
already_merged = "user_id" in trip_data_df.columns
if not already_merged:
    trip_data_df = trip_data_df.merge(users_df, on=user_key_columns, how="left")
users_df.head()

Unnamed: 0,birth_year,user_type,gender,user_id
0,1964,Subscriber,2,1
1,1962,Subscriber,1,2
2,1962,Subscriber,2,3
3,1984,Subscriber,1,4
4,1979,Customer,0,5


Now we are going to use the SQL INSERT command to populate our tables with the cleaned DataFrames:

In [None]:
from psycopg2.extras import execute_values
import psycopg2

def _to_db_value(value):
    """Convert one DataFrame cell into a value psycopg2 can adapt."""
    # NumPy scalars expose .item(), which returns the equivalent native Python object.
    if hasattr(value, "item"):
        value = value.item()
    # pandas' missing-value markers have no SQL adaptation; NULL is the equivalent.
    if value is pd.NA or value is pd.NaT:
        return None
    # NaN is the float missing marker (NaN is the only float not equal to itself).
    if isinstance(value, float) and value != value:
        return None
    return value


def insert_data(table_name, columns, dataframe, conn_string, page_size=1000):
    """
    Bulk-insert all rows of a pandas DataFrame into a PostgreSQL table.

    The insert runs in a single transaction: it is committed only if the table's row
    count grew by exactly ``len(dataframe)``, otherwise it is rolled back. Database
    errors are reported with ``print`` rather than raised, so the function always
    returns ``None``.

    table_name: str, (optionally schema-qualified) name of the target table.
        Interpolated into the SQL text as-is, so it must come from trusted code.
    columns: tuple of target column names, e.g. ("id", "name", "latitude"), given in
        the same order as the DataFrame's columns. Also interpolated as-is.
    dataframe: pandas DataFrame; values are matched to ``columns`` by position.
        Missing values (pd.NA, NaT, NaN) are inserted as NULL.
    conn_string: connection string passed to ``psycopg2.connect``.
    page_size: rows sent per INSERT statement by ``execute_values``.
    """
    # Convert every cell to a native Python value before touching the database.
    values = [
        tuple(_to_db_value(v) for v in row)
        for row in dataframe.itertuples(index=False)
    ]

    col_str = ', '.join(columns)
    sql = f"INSERT INTO {table_name} ({col_str}) VALUES %s"

    conn = None
    try:
        conn = psycopg2.connect(conn_string)
        # `with conn` commits on success and rolls back on an exception;
        # it does not close the connection (see `finally`).
        with conn:
            with conn.cursor() as cur:
                # Count rows before
                cur.execute(f"SELECT COUNT(*) FROM {table_name};")
                before = cur.fetchone()[0]

                # Bulk insert
                execute_values(cur, sql, values, page_size=page_size)

                # Count rows after
                cur.execute(f"SELECT COUNT(*) FROM {table_name};")
                after = cur.fetchone()[0]

                inserted = after - before
                if inserted != len(values):
                    # Raising inside the transaction block rolls the insert back.
                    raise psycopg2.DatabaseError(
                        f"Expected {len(values)} rows inserted, got {inserted}"
                    )

        # Reported only after the transaction has been committed.
        print(f"Rows inserted: {inserted}")

    except psycopg2.DatabaseError as e:
        print("Database error:", e)
    finally:
        # Always release the connection.
        if conn is not None:
            conn.close()


In [98]:
# Load the stations; the DataFrame's columns map by position onto the listed table columns.
insert_data(
    table_name="bikeshare.stations",
    columns=("id", "name", "latitude", "longitude"),
    dataframe=stations_df,
    conn_string=conn_string,
)


Rows inserted: 102


In [99]:
# Seed the gender lookup table with the three codes used in the trip data.
# ON CONFLICT DO NOTHING makes the cell safe to re-run: rows that already exist are
# skipped instead of raising a primary-key violation.
conn = None
try:
    conn = psycopg2.connect(conn_string)
    # `with conn` commits or rolls back the transaction; it does not close the connection.
    with conn:
        with conn.cursor() as cur:
            cur.execute("""
                INSERT INTO bikeshare.gender (id, gender_name)
                VALUES
                    (0, 'Unknown'),
                    (1, 'Male'),
                    (2, 'Female')
                ON CONFLICT (id) DO NOTHING;
            """)
except psycopg2.DatabaseError as e:
    print(e)
finally:
    # Release the connection explicitly so it is not leaked.
    if conn is not None:
        conn.close()


In [100]:
# Load the users. Values are matched by position, so the listed table columns follow
# users_df's column order; its last column, user_id, fills the table's "id".
insert_data(
    table_name="bikeshare.users",
    columns=("birth_year", "user_type", "gender", "id"),
    dataframe=users_df,
    conn_string=conn_string,
)



Rows inserted: 151


In [101]:
# Load the date dimension; date_df was already reordered to match this column list.
insert_data(
    table_name="bikeshare.dim_date",
    columns=("date_id", "full_date", "year", "month", "day", "month_name", "day_name", "quarter"),
    dataframe=date_df,
    conn_string=conn_string,
)

Rows inserted: 366


For the trip_informations table we create a new dataframe containing only the columns the table needs, so we can pass it to our insert_data function.

In [102]:
# Columns of the fact table, listed once and used both to slice the DataFrame and to
# name the INSERT targets, so the two cannot drift apart. The table's serial "id" is
# left out and generated by the database.
trip_inf_columns = (
    "trip_duration",
    "start_time",
    "stop_time",
    "start_station_id",
    "end_station_id",
    "bike_id",
    "user_id",
    "date_key",
)
trip_inf_df = trip_data_df[list(trip_inf_columns)].copy()

insert_data("bikeshare.trip_informations", trip_inf_columns, trip_inf_df, conn_string)


Rows inserted: 247582


Creating views:

In [103]:
import psycopg2

# Create (or replace) the reporting views in one transaction.
# psycopg2's `with conn:` only commits or rolls back; it does not close the connection,
# so the connection is closed explicitly in `finally`.
conn = None
try:
    conn = psycopg2.connect(conn_string)
    with conn:
        with conn.cursor() as cur:

            # You can see which bikes are the most popular, how long they are used on
            # average, and the total time each bike has been ridden.
            cur.execute("""
                CREATE OR REPLACE VIEW bikeshare.bike_usage AS
                SELECT 
                    bike_id,
                    COUNT(id) AS total_trips,
                    ROUND(AVG(trip_duration)::numeric, 2) AS avg_trip_duration,
                    ROUND(SUM(trip_duration)::numeric, 2) AS total_duration
                FROM bikeshare.trip_informations
                GROUP BY bike_id;
            """)

            # Useful to understand user behavior, identify heavy users, and spot
            # patterns across different user types.
            cur.execute("""
                CREATE OR REPLACE VIEW bikeshare.user_trip_stats AS
                SELECT 
                    u.id AS user_id,
                    u.user_type,
                    g.gender_name,
                    COUNT(t.id) AS total_trips,
                    ROUND(AVG(t.trip_duration)::numeric, 2) AS avg_trip_duration,
                    ROUND(SUM(t.trip_duration)::numeric, 2) AS total_duration
                FROM bikeshare.users u
                LEFT JOIN bikeshare.gender g ON u.gender = g.id
                LEFT JOIN bikeshare.trip_informations t ON t.user_id = u.id
                GROUP BY u.id, u.user_type, g.gender_name;
            """)

            # Helps with station capacity planning and identifying busy or underused
            # stations. A round trip joins its station once and is counted in both
            # trips_started and trips_ended.
            cur.execute("""
                CREATE OR REPLACE VIEW bikeshare.station_usage AS
                SELECT 
                    s.id AS station_id,
                    s.name AS station_name,
                    COUNT(t.id) FILTER (WHERE t.start_station_id = s.id) AS trips_started,
                    COUNT(t.id) FILTER (WHERE t.end_station_id = s.id) AS trips_ended
                FROM bikeshare.stations s
                LEFT JOIN bikeshare.trip_informations t
                    ON t.start_station_id = s.id OR t.end_station_id = s.id
                GROUP BY s.id, s.name;
            """)

            # Offers a complete view of the system's activity for reporting or
            # analytics purposes.
            cur.execute("""
                CREATE OR REPLACE VIEW bikeshare.trip_summary AS
                SELECT 
                    t.id AS trip_id,
                    u.id AS user_id,
                    u.user_type,
                    g.gender_name,
                    u.birth_year,
                    ROUND(t.trip_duration::numeric, 2) AS trip_duration,
                    t.start_time,
                    t.stop_time,
                    s_start.name AS start_station,
                    s_end.name AS end_station,
                    t.bike_id
                FROM bikeshare.trip_informations t
                JOIN bikeshare.users u ON t.user_id = u.id
                JOIN bikeshare.gender g ON u.gender = g.id
                JOIN bikeshare.stations s_start ON t.start_station_id = s_start.id
                JOIN bikeshare.stations s_end ON t.end_station_id = s_end.id;
            """)

            # Summarizes daily bike trips by date, showing total trips, average and
            # total trip duration, and unique users. Days without trips still appear
            # (LEFT JOIN from dim_date) with a trip count of 0.
            cur.execute("""
                CREATE OR REPLACE VIEW bikeshare.daily_trip_summary AS
                SELECT
                    d.date_id,
                    d.full_date,
                    d.year,
                    d.month,
                    d.day,
                    d.month_name,
                    d.day_name,
                    d.quarter,
                    COUNT(t.id) AS total_trips,
                    CAST(ROUND(AVG(t.trip_duration)::numeric, 2) AS numeric) AS avg_trip_duration,
                    CAST(ROUND(SUM(t.trip_duration)::numeric, 2) AS numeric) AS total_trip_duration,
                    COUNT(DISTINCT t.user_id) AS unique_users
                FROM bikeshare.dim_date d
                LEFT JOIN bikeshare.trip_informations t
                    ON t.date_key = d.date_id
                GROUP BY
                    d.date_id, d.full_date, d.year, d.month, d.day, d.month_name, d.day_name, d.quarter
                ORDER BY d.full_date;

            """)

    # Reported only after the transaction block has exited, i.e. after the commit.
    print("All views created")

except psycopg2.DatabaseError as e:
    print("Database error:", e)
except Exception as e:
    print("Unexpected error:", e)
finally:
    # Release the connection explicitly so it is not leaked.
    if conn is not None:
        conn.close()


All views created
