Build a regression model.

In [3]:
import sys
print(sys.executable)


c:\Users\Admin\anaconda3\envs\bootcamp\python.exe


In [7]:
import requests
import pandas as pd

url = "https://api.citybik.es/v2/networks/bicing"
response = requests.get(url)

if response.status_code == 200:
    data = response.json()
    stations = data['network']['stations']
    
    bike_station_data = []
    for station in stations:
        station_info = {
            'station': station['name'],
            'latitude': station['latitude'],
            'longitude': station['longitude'],
            'free_bikes': station['free_bikes']
        }
        bike_station_data.append(station_info)
    
    bike_df = pd.DataFrame(bike_station_data)
    
    print(bike_df.head())
else:
    print(f"Failed to retrieve CityBikes data. Status code: {response.status_code}")


                           station   latitude  longitude  free_bikes
0   PL  JOANIC - C / BRUNIQUER, 59  41.405520   2.162255           1
1                C/ SARDENYA,  178  41.396717   2.182508          15
2                 PL. CATALUNYA, 7  41.386543   2.169427           8
3                C/ VILADOMAT, 200  41.384810   2.150807           5
4        C/ DEL DOCTOR TRUETA, 222  41.399217   2.204141           6


In [8]:
import requests
import time

yelp_url = "https://api.yelp.com/v3/businesses/search"
headers = {
    "Authorization": "Bearer M2nNMPI4L-7WEcuB0uVLlNLLW0qrwURgQC8Jc51xjhPs3fHCvH0C3f5IMTzTwnHqHZu88A1HQKjLzRrEZ3u4ocWeA7rRQj8YwCA1CNzo0i5MoOcev_LsBrt9_0D7Z3Yx"
}

yelp_results = []

for _, row in bike_df.iterrows():
    lat = row['latitude']
    lon = row['longitude']
    station_name = row['station']

    params = {
        "latitude": lat,
        "longitude": lon,
        "radius": 1000,
        "categories": "restaurants",
        "limit": 50
    }

    response = requests.get(yelp_url, headers=headers, params=params)
    
    if response.status_code == 200:
        data = response.json()
        for business in data.get("businesses", []):
            yelp_results.append({
                "station": station_name,
                "name": business.get("name"),
                "category": business["categories"][0]["title"] if business.get("categories") else None,
                "latitude": business["coordinates"]["latitude"],
                "longitude": business["coordinates"]["longitude"],
                "distance_m": business.get("distance"),
                "rating": business.get("rating"),
                "review_count": business.get("review_count")
            })
    else:
        print(f"Request failed for station {station_name}, Status Code: {response.status_code}")
    

yelp_df = pd.DataFrame(yelp_results)

print(yelp_df.head())

                           station                           name  \
0   PL  JOANIC - C / BRUNIQUER, 59  Rabipelao Gracia- Le Rabipelè   
1   PL  JOANIC - C / BRUNIQUER, 59            Cerveseria Catalana   
2   PL  JOANIC - C / BRUNIQUER, 59                Taverna El Glop   
3   PL  JOANIC - C / BRUNIQUER, 59                      La Pepita   
4   PL  JOANIC - C / BRUNIQUER, 59                        Lasarte   

         category   latitude  longitude   distance_m  rating  review_count  
0  Latin American  41.403798   2.159753   283.212970     4.5            30  
1         Spanish  41.392393   2.160993  1463.421413     4.3          1769  
2         Spanish  41.404930   2.159590   230.438099     4.2           158  
3      Tapas Bars  41.397956   2.161058   846.958136     4.6           623  
4         Spanish  41.393650   2.162160  1324.438629     4.6            82  


In [9]:
import pandas as pd
combined_df = pd.merge(
    yelp_df,
    bike_df,       
    how="left",
    on="station",
    suffixes=('_restaurant', '_station')
)
combined_df.head()


Unnamed: 0,station,name,category,latitude_restaurant,longitude_restaurant,distance_m,rating,review_count,latitude_station,longitude_station,free_bikes
0,"PL JOANIC - C / BRUNIQUER, 59",Rabipelao Gracia- Le Rabipelè,Latin American,41.403798,2.159753,283.21297,4.5,30,41.40552,2.162255,1
1,"PL JOANIC - C / BRUNIQUER, 59",Cerveseria Catalana,Spanish,41.392393,2.160993,1463.421413,4.3,1769,41.40552,2.162255,1
2,"PL JOANIC - C / BRUNIQUER, 59",Taverna El Glop,Spanish,41.40493,2.15959,230.438099,4.2,158,41.40552,2.162255,1
3,"PL JOANIC - C / BRUNIQUER, 59",La Pepita,Tapas Bars,41.397956,2.161058,846.958136,4.6,623,41.40552,2.162255,1
4,"PL JOANIC - C / BRUNIQUER, 59",Lasarte,Spanish,41.39365,2.16216,1324.438629,4.6,82,41.40552,2.162255,1


In [10]:
import statsmodels.api as sm
import pandas as pd

df_model = combined_df.copy()
df_model = pd.get_dummies(df_model, columns=['category'], drop_first=True)

features = ['rating', 'review_count', 'distance_m'] + \
           [col for col in df_model.columns if col.startswith('category_')]

X = df_model[features]
y = df_model['free_bikes']

X = sm.add_constant(X)

model = sm.OLS(y, X).fit()

print(model.summary())


                            OLS Regression Results                            
Dep. Variable:             free_bikes   R-squared:                       0.028
Model:                            OLS   Adj. R-squared:                  0.024
Method:                 Least Squares   F-statistic:                     6.828
Date:                Mon, 14 Apr 2025   Prob (F-statistic):           1.51e-93
Time:                        19:38:22   Log-Likelihood:                -86486.
No. Observations:               25813   AIC:                         1.732e+05
Df Residuals:                   25703   BIC:                         1.741e+05
Df Model:                         109                                         
Covariance Type:            nonrobust                                         
                                         coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------------------------------
cons

Provide model output and an interpretation of the results. 

##  OLS Regression Model Summary

**Dependent Variable**: `free_bikes`  
**Model Type**: Ordinary Least Squares (OLS)  
**Number of Observations**: 25,813  
**Number of Predictors**: 109  
**R-squared**: 0.028  
**Adjusted R-squared**: 0.024  
**F-statistic**: 6.828 (p < 0.001)  
**Condition Number**: 1.81e+06 ( indicates potential multicollinearity)

---

###  Key Coefficients

| Variable       | Coefficient | P-value  | Interpretation |
|----------------|-------------|----------|----------------|
| `Intercept`    | 12.27       | 0.076    | Average bikes when all predictors are zero |
| `rating`       | **+0.52**   | **< 0.001** ✅ | Each additional star increases bike availability by ~0.52 |
| `review_count` | 0.0001      | 0.675 ❌  | Not statistically significant |
| `distance_m`   | 0.00006     | 0.434 ❌  | Not statistically significant |
| POI Categories | Mostly negative | > 0.05 ❌ | None were statistically significant |

---

###  Model Interpretation

-  **Higher POI ratings** are **positively associated** with more available bikes.
-  **Review counts** and **distance to POIs** do **not significantly affect** bike availability.
-  **POI categories** (like Tapas Bars, Sushi, etc.) also **do not significantly predict** bike availability.

### Limitations

- The model explains **only 2.8% of the variance** in free bikes.
- Possible missing variables: **time of day**, **day of week**, **weather**, **event schedules**, etc.
- Condition number is high, suggesting **multicollinearity** — predictors may be correlated.

---

###  Insight Summary

| ✅ Helpful                   | ❌ Not Helpful             |
|----------------------------|----------------------------|
| POI rating (statistically significant) | POI category (not significant) |
| Large dataset (25k+ rows)  | Weak predictive power (R² = 0.028) |


# Stretch

How can you turn the regression model into a classification model?

In [None]:
import pandas as pd

def bike_availability_class(x):
    if x <= 3:
        return 0  # Low
    elif x <= 7:
        return 1  # Medium
    else:
        return 2  # High

df_model['availability_class'] = df_model['free_bikes'].apply(bike_availability_class)


In [12]:

X = df_model.drop(columns=['free_bikes', 'availability_class'])

y = df_model['availability_class']


In [15]:
print(X_train.dtypes)


station                  object
name                     object
latitude_restaurant     float64
longitude_restaurant    float64
distance_m              float64
                         ...   
category_Vegetarian       uint8
category_Venezuelan       uint8
category_Vietnamese       uint8
category_Wine Bars        uint8
category_Wok              uint8
Length: 115, dtype: object


In [22]:
from sklearn.model_selection import train_test_split

X = df_model.drop(columns=['free_bikes', 'availability_class'])  

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [24]:
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

X = df_model.drop(columns=['free_bikes', 'availability_class'])
y = df_model['availability_class']

numeric_features = X.select_dtypes(include=['int64', 'float64']).columns
categorical_features = X.select_dtypes(include=['object']).columns

preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numeric_features),
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features)
    ]
)

clf_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', RandomForestClassifier(n_estimators=100, random_state=42))
])

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

clf_pipeline.fit(X_train, y_train)

from sklearn.metrics import classification_report
y_pred = clf_pipeline.predict(X_test)
print(classification_report(y_test, y_pred))


              precision    recall  f1-score   support

           0       1.00      1.00      1.00      1845
           1       1.00      1.00      1.00      1058
           2       1.00      1.00      1.00      2260

    accuracy                           1.00      5163
   macro avg       1.00      1.00      1.00      5163
weighted avg       1.00      1.00      1.00      5163



In [26]:
df_model.dtypes


station                  object
name                     object
latitude_restaurant     float64
longitude_restaurant    float64
distance_m              float64
                         ...   
category_Venezuelan       uint8
category_Vietnamese       uint8
category_Wine Bars        uint8
category_Wok              uint8
availability_class        int64
Length: 117, dtype: object