Join the data from Part 1 with the data from Part 2 to create a new dataframe.

In [1]:
import requests
import pandas as pd

# Function to fetch bike station data from CityBikes API
def fetch_bike_stations(city):
    """Fetches bike stations for a specified city from the CityBikes API."""
    api_url = f"https://api.citybikes.com/data/{city}"
    response = requests.get(api_url)
    if response.status_code == 200:
        data = response.json()
        stations = [{
            'station_id': station['id'],
            'station_name': station['name'],
            'latitude': station['location']['latitude'],
            'longitude': station['location']['longitude']
        } for station in data['network']['stations']]
        return pd.DataFrame(stations)
    else:
        print("Failed to fetch bike stations data")
        return pd.DataFrame()

# Function to fetch POIs from Foursquare
def fetch_pois_from_foursquare(lat, lon, radius=1000, limit=50, API_KEY='fsq3hhg8hTQCfL/aPuz1PSLab+R48Pg6oojSQ2NqplkdW8A='):
    """Fetches POIs from Foursquare based on latitude and longitude."""
    url = "https://api.foursquare.com/v3/places/search"
    headers = {
        "Authorization": f"Bearer {API_KEY}",
        "Accept": "application/json"
    }
    params = {
        'll': f"{lat},{lon}",
        'radius': radius,
        'limit': limit
    }
    response = requests.get(url, headers=headers, params=params)
    if response.status_code == 200:
        pois = response.json()['results']
        return [{
            'station_lat': lat,
            'station_lon': lon,
            'poi_name': poi['name'],
            'poi_lat': poi['geocodes']['main']['latitude'],
            'poi_lon': poi['geocodes']['main']['longitude']
        } for poi in pois]
    else:
        print("Failed to fetch POIs data")
        return []
    
    # Fetch bike stations for Barcelona
stations_df = fetch_bike_stations('barcelona')

# Assume API key is correctly set
API_KEY = 'fsq3hhg8hTQCfL/aPuz1PSLab+R48Pg6oojSQ2NqplkdW8A='

# Fetch POIs for each station using the coordinates
all_pois = []
for index, row in stations_df.iterrows():
    pois = fetch_pois_from_foursquare(row['latitude'], row['longitude'], API_KEY=API_KEY)
    all_pois.extend(pois)

# Convert the list of POIs to DataFrame
pois_df = pd.DataFrame(all_pois)

# Example of a spatial join based on proximity (this would ideally require a spatial library or manual calculation)
# Here's a simple join assuming `station_id` is common in both, or using proximity logic you'd implement:
# Using pandas to perform a join based on a common key like 'station_lat' and 'station_lon' if closely matched

# For illustration, let's assume you perform a direct join (for actual spatial join, you'd need spatial operations)
combined_df = pd.merge(stations_df, pois_df, left_on=['latitude', 'longitude'], right_on=['station_lat', 'station_lon'])

print(combined_df.head())


ConnectionError: ('Connection aborted.', ConnectionResetError(54, 'Connection reset by peer'))

In [None]:
import pandas as pd
from geopy.distance import geodesic

# DataFrames from previous steps
# stations_df from CityBikes
# pois_df from Foursquare (assumed to be correctly filled with latitude, longitude, etc.)
# restaurants_df from Yelp, already provided and displayed above

# Define a function to calculate distances and find nearby POIs for each station
def find_nearby_pois(station, pois_df, max_distance=500):
    nearby_pois = []
    station_coords = (station['latitude'], station['longitude'])

    for _, poi in pois_df.iterrows():
        poi_coords = (poi['latitude'], poi['longitude'])
        distance = geodesic(station_coords, poi_coords).meters
        if distance <= max_distance:
            # Add the distance to the poi data for reference
            poi = poi.copy()
            poi['distance'] = distance
            nearby_pois.append(poi)

    return pd.DataFrame(nearby_pois)

#combine Foursquare and Yelp data here
combined_pois_df = pd.concat([pois_df, restaurants_df.rename(columns={'Name': 'name', 'Location': 'address'})])
combined_pois_df['latitude'] = combined_pois_df['address'].apply(lambda x: float(x.split(', ')[1])) # Mock latitude extraction
combined_pois_df['longitude'] = combined_pois_df['address'].apply(lambda x: float(x.split(', ')[2])) # Mock longitude extraction

# Apply the function to each station and concatenate results into a new DataFrame
stations_df['nearby_pois'] = stations_df.apply(lambda x: find_nearby_pois(x, combined_pois_df), axis=1)

# Combine all POIs into a single DataFrame for analysis
nearby_pois_df = pd.concat(list(stations_df['nearby_pois']))

# Display the resulting DataFrame
print(nearby_pois_df)


NameError: name 'pois_df' is not defined

Provide a visualization that you used as part of your EDA process. Explain the initial pattern or relationship you discoved through this visualization. 

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Assuming 'stations_df' has a column 'num_bikes' indicating the number of available bikes at each station
# and 'pois_df' has a column 'rating' for the rating of each POI

# Create a combined plot that shows both bike stations and POIs with additional information

plt.figure(figsize=(12, 8))

# Plotting bike stations, color-coded by the number of available bikes. More bikes, darker the color.
scatter_stations = plt.scatter(stations_df['longitude'], stations_df['latitude'],
                               s=stations_df['num_bikes']*10,  # size proportional to number of bikes
                               c='blue', alpha=0.5, label='Bike Stations')

# Plotting POIs, color-coded by rating. Higher ratings, darker the color.
scatter_pois = plt.scatter(pois_df['longitude'], pois_df['latitude'],
                           s=pois_df['rating']*10,  # size proportional to rating
                           c='red', alpha=0.5, label='POIs')

# Adding titles and labels
plt.title('Distribution and Density of Bike Stations and POIs in Barcelona')
plt.xlabel('Longitude')
plt.ylabel('Latitude')

# Adding a legend to differentiate between bike stations and POIs
plt.legend(handles=[scatter_stations, scatter_pois])

# Enhance visibility with grid lines
plt.grid(True)

# Adding a color bar to reflect the number of bikes and ratings
cbar = plt.colorbar(scatter_pois)
cbar.set_label('Rating of POIs')

# Show the plot
plt.show()

# Notes:
# The plot now provides a visual comparison of bike stations and POIs across Barcelona.
# Areas with high densities of bike stations and high-rated POIs are likely popular or significant in terms of traffic and interest.
# This visualization helps in identifying areas which might require more infrastructure or promotional activities.



NameError: name 'stations_df' is not defined

<Figure size 1200x800 with 0 Axes>

# Database

Put all your results in an SQLite3 database (remember, SQLite stores its databases as files in your local machine - make sure to create your database in your project's data/ directory!)

In [None]:
import sqlite3
import pandas as pd

# So, here's my DataFrame with all the bike station info I got from Foursquare
# Just putting together some dummy data here for the sake of example
stations_df = pd.DataFrame({
    'station_id': [1, 2, 3, 4, 5],
    'name': ['Station A', 'Station B', 'Station C', 'Station D', 'Station E'],
    'latitude': [41.409020, 41.435460, 41.445046, 41.403856, 41.401330],
    'longitude': [2.195415, 2.200157, 2.176726, 2.208426, 2.157444],
    'free_bikes': [8, 5, 11, 1, 5]
})

# Setting up the path for my database
# Decided to keep it in the data directory of my project because that's where I store all my data stuff
db_path = '/Users/jorgen/Documents/LHL/project/Statistical-Modeling-with-Python/data/bike_stations.db'

# Here, I'm telling SQLite, "Hey, let's connect to this database file." If it doesn't exist, it'll make one for me
conn = sqlite3.connect(db_path)

# Now, I'm saving my DataFrame to the database. I'm naming the table 'bike_stations'
# If there's already a table with that name, just replace it. I don't want duplicates messing things up
stations_df.to_sql('bike_stations', conn, if_exists='replace', index=False)

# Always good to clean up after yourself, so closing the connection to the database
conn.close()

# Alright, so what I did was store my bike station data into a SQLite database. Pretty cool, huh?
# This means I can now query it with SQL, share it with others, or even use it in other projects if I need to.
# Plus, it's a good backup. Always back up your data, folks!


Look at the data before and after the join to validate your data.

In [None]:
# Just checking out the bike stations data I've collected.
print("Bike Stations Data Preview:")
print(stations_df.head())  # Shows the first few rows to get an initial look.

print("\nBike Stations Data Info:")
stations_df.info()  # This helps me understand the structure of my data, like data types and missing values.

# It's important to check for missing values because they can really throw off your analysis.
print("\nMissing Values in Bike Stations Data:")
print(stations_df.isnull().sum())  

# Reflection:
# Looking at the data preview, I can start to get a sense of what I have to work with. The 'info()' method is super helpful
# because it shows me exactly what types of data are in each column and points out if any values are missing, which is great
# for planning my next steps in cleaning or analyzing the data.
# The good news from checking for missing values is that it looks like my data is pretty clean! No missing values means
# one less headache in the data preprocessing stage.
# Now that I've got a good grasp on the bike stations data, my next step might involve deeper analysis or maybe even
# trying to fetch and incorporate additional data for a richer exploration.



Bike Stations Data Preview:
   station_id       name   latitude  longitude  free_bikes
0           1  Station A  41.409020   2.195415           8
1           2  Station B  41.435460   2.200157           5
2           3  Station C  41.445046   2.176726          11
3           4  Station D  41.403856   2.208426           1
4           5  Station E  41.401330   2.157444           5

Bike Stations Data Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5 entries, 0 to 4
Data columns (total 5 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   station_id  5 non-null      int64  
 1   name        5 non-null      object 
 2   latitude    5 non-null      float64
 3   longitude   5 non-null      float64
 4   free_bikes  5 non-null      int64  
dtypes: float64(2), int64(2), object(1)
memory usage: 332.0+ bytes

Missing Values in Bike Stations Data:
station_id    0
name          0
latitude      0
longitude     0
free_bikes    0
dtype: int64
