In [1]:
#import libraries
import os

import matplotlib.pyplot as plt  #pyplot provides figure/scatter/title/show; the bare `matplotlib` package does not
import numpy as np
import pandas as pd
import seaborn as sns
from shapely.geometry import Point, Polygon
from sklearn.impute import KNNImputer


Data cleaning

In [None]:
#load the data
#the path is built with os.path.join so it resolves on any operating system, not only Windows
#dtype='unicode' reads every column as text; the coordinates are cleaned and converted further down
data = pd.read_csv(os.path.join('Thesis data', 'combined.csv'), dtype='unicode')

In [None]:
#drop unnecessary columns
#reassigning (instead of inplace=True) together with errors='ignore' keeps this cell re-runnable:
#a second execution no longer raises KeyError for columns that are already gone
data = data.drop(
    columns=['notification', 'notificationId', 'application', 'battery', 'surveyId', 'startTime', 'endTime',
             'startTimeMillis', 'endTimeMillis', 'session', 'id', 'model', 'studyKey', 'data_version'],
    errors='ignore',
)


In [None]:
#count the missing values in each coordinate column
data[['longitude', 'latitude']].isnull().sum()

In [None]:
#summary statistics of the two coordinate columns
data.loc[:, ['longitude', 'latitude']].describe()

In [None]:
#clean the invalid values
import re
def clean_coordinates(value):
    """Strip junk characters from a raw coordinate string.

    Parameters
    ----------
    value : str or any
        Raw cell content. Non-string input (e.g. the float NaN pandas uses
        for a missing cell) is returned unchanged instead of raising.

    Returns
    -------
    str or float
        The cleaned numeric string (digits, at most one dot, minus sign),
        or NaN when what is left cannot be parsed as a number.
    """
    #missing cells arrive as float NaN; re.sub would raise TypeError on them
    if not isinstance(value, str):
        return value

    #remove non-numeric characters
    cleaned_value = re.sub('[^0-9.-]', '', value)

    # Retain only the first dot
    head, dot, tail = cleaned_value.partition('.')
    cleaned_value = head + dot + tail.replace('.', '')

    #leftovers such as '', '-' or '5-3' are not numbers: report them as missing
    #so the later float conversion does not crash on them
    try:
        float(cleaned_value)
    except ValueError:
        return float('nan')

    return cleaned_value


#run the cleaner element-wise over both coordinate columns of 'data'
data['latitude'] = data['latitude'].map(clean_coordinates)
data['longitude'] = data['longitude'].map(clean_coordinates)

In [None]:
#convert the values to float
#errors='coerce' turns any value that still cannot be parsed into NaN instead of aborting the cell;
#the trailing astype(float) guarantees a float column even if every value happens to look like an integer
data['latitude'] = pd.to_numeric(data['latitude'], errors='coerce').astype(float)
data['longitude'] = pd.to_numeric(data['longitude'], errors='coerce').astype(float)

In [None]:
#visualize the data points

#plotting the GPS data points
fig, ax = plt.subplots(figsize=(10, 6))  # Set the figure size as desired

#scatter plot of lon/lat data points
#plot 'data': the de-duplicated 'data1' frame is only created further down the notebook,
#so referencing it here raised NameError on a fresh kernel
ax.scatter(data['longitude'], data['latitude'], s=10)
ax.set_xlim(data['longitude'].min() - 0.1, data['longitude'].max() + 0.1)
ax.set_ylim(data['latitude'].min() - 0.1, data['latitude'].max() + 0.1)
# Set plot title and axis labels
ax.set_title('GPS Data Points')
ax.set_xlabel('Longitude')
ax.set_ylabel('Latitude')

plt.show()



In [None]:
#remove outliers

#valid geographic ranges (inclusive)
lat_range = (-90, 90)
lon_range = (-180, 180)

#filter and replace out-of-range values with NaN
#the mask is negated: between() is True for the VALID values, and those must be kept
data.loc[~data['latitude'].between(*lat_range), 'latitude'] = np.nan
data.loc[~data['longitude'].between(*lon_range), 'longitude'] = np.nan

In [None]:
#check for missings after the cleaning

data.loc[:, ['longitude', 'latitude']].isnull().sum()

In [None]:
#visualize after the cleaning

#plotting the GPS data points
fig, ax = plt.subplots(figsize=(10, 6))  # Set the figure size as desired

#scatter plot of lon/lat data points
#plot 'data': 'data1' does not exist yet at this point of the notebook (NameError on a fresh kernel)
ax.scatter(data['longitude'], data['latitude'], s=10)
ax.set_xlim(data['longitude'].min() - 0.1, data['longitude'].max() + 0.1)
ax.set_ylim(data['latitude'].min() - 0.1, data['latitude'].max() + 0.1)
#set plot title and axis labels
ax.set_title('GPS Data Points')
ax.set_xlabel('Longitude')
ax.set_ylabel('Latitude')

plt.show()

In [None]:
#visualize the locations on the map

from mpl_toolkits.basemap import Basemap

#create a new figure
fig = plt.figure(figsize=(12, 8))

#create a Basemap object spanning the full latitude/longitude range
m = Basemap(
    projection='mill',
    llcrnrlat=-90,
    urcrnrlat=90,
    llcrnrlon=-180,
    urcrnrlon=180,
    resolution='c',
)

#draw coastlines, countries, and states
m.drawcoastlines()
m.drawcountries()
m.drawstates()

#project the longitude/latitude columns to map coordinates and plot them
lon = data['longitude']
lat = data['latitude']
x, y = m(lon, lat)
m.scatter(x, y, marker='o', color='red', zorder=10, s=1)

#add a title
plt.title('Latitude and Longitude Data')

plt.show()

KNN missing data imputation

In [None]:
#separate the lon/lat columns
lon_lat_data = data[['longitude', 'latitude']]

#fill each missing coordinate from the 5 nearest rows
imputer = KNNImputer(n_neighbors=5)

#fit_transform returns a bare array; rebuild the frame with the ORIGINAL index so the
#assignment below lines up row for row even when 'data' does not carry a default RangeIndex
imputed_data = pd.DataFrame(
    imputer.fit_transform(lon_lat_data),
    columns=['longitude', 'latitude'],
    index=lon_lat_data.index,
)

#update the original DataFrame with the imputed values
data['longitude'] = imputed_data['longitude']
data['latitude'] = imputed_data['latitude']


Create polygons for extracting campus location

In [None]:
#drop longitude and latitude duplicates
#copy on a new dataset

data1 = data.drop_duplicates(['longitude', 'latitude'], keep='first')

#the following cells refer to this frame as 'data_1'; expose that name as well
#(same object, not a copy) so they do not fail with NameError
data_1 = data1

In [None]:
#vertices of the campus outline as (longitude, latitude) tuples
campus_polygon = [
    (5.0427518, 51.5662584),
    (5.0409279, 51.562777),
    (5.0391469, 51.561203),
    (5.045198, 51.5599623),
    (5.0455842, 51.5619633),
    (5.0567637, 51.5612163),
    (5.0569139, 51.5624035),
    (5.0509272, 51.5628971),
    (5.0455628, 51.5631639),
    (5.046228, 51.5656582),
    (5.0428695, 51.5660852),
]

In [None]:
#build the shapely polygon from the campus vertices
polygon = Polygon(campus_polygon)

#new boolean column for the point-in-polygon status, False for every row to start with
data_1['is_inside'] = np.full(len(data_1), False)

In [None]:
#create boolean column identifying whether a location belongs to campus or not

data_1['is_inside'] = [
    Point(longitude, latitude).within(polygon)
    for longitude, latitude in zip(data_1['longitude'], data_1['latitude'])
]

In [None]:
#number of unique locations inside (True) and outside (False) the campus polygon
print(data_1['is_inside'].value_counts(dropna=True))


In [None]:
#keep only the rows flagged as being on campus
data_true = data_1.loc[data_1['is_inside'].eq(True)]

Add 'on campus' column on the whole dataset

In [None]:
#boolean 'on_campus' column for the whole dataset, False for every row to start with

data['on_campus'] = np.zeros(len(data), dtype=bool)

In [None]:
#flag every row of the whole dataset whose point falls inside the campus polygon

data['on_campus'] = [
    Point(longitude, latitude).within(polygon)
    for longitude, latitude in zip(data['longitude'], data['latitude'])
]

In [None]:
#number of rows on campus (True) and off campus (False)
print(data['on_campus'].value_counts(dropna=True))

In [None]:
#save the dataset, including the new 'on_campus' column, as CSV
#NOTE(review): absolute, user-specific Windows path - this cell only runs unchanged on that machine;
#consider a path relative to a configurable data directory
data.to_csv(r'C:\Users\maria\Desktop\Final data\location features\on_campus_data.csv')

Add 'on campus' column to the app event dataset

In [None]:
#load the app event data; the path is built with os.path.join so it resolves on any operating system
#dtype='unicode' reads every column as text
app_events = pd.read_csv(os.path.join('Thesis data', 'combined.csv'), dtype='unicode')

In [None]:
#index both datasets by 'StudentID'
app_events = app_events.set_index('StudentID')
data = data.set_index('StudentID')

In [None]:
#place the two datasets side by side (column-wise)
#NOTE(review): both frames appear to come from the same CSV, so every column present in both
#(e.g. 'longitude', 'latitude') will show up twice in 'locations' - confirm this is intended
locations = pd.concat([app_events, data], axis='columns')

Feature engineering

1. Time features

In [None]:
#parse the start and end time columns into datetimes
locations = locations.assign(
    startTime=pd.to_datetime(locations['startTime']),
    endTime=pd.to_datetime(locations['endTime']),
)

In [None]:
#name of the month in which each record starts

locations = locations.assign(
    month_name=pd.to_datetime(locations['startTime']).dt.month_name()
)

In [None]:
#'date' holds the day of the month of the start time

locations = locations.assign(date=locations['startTime'].dt.day)

In [None]:
#month column ('YYYY-MM')
#derived from startTime, like 'date' and 'month_name': taking it from endTime put records that
#cross a month boundary into impossible (month, day) groups, e.g. day 31 of the following month

locations['month'] = locations['startTime'].dt.strftime('%Y-%m')

2. Time spent on campus

In [None]:
#one group per student, month and day of month
grouped = locations.groupby(by=['StudentID', 'month', 'date'])

def calculate_total_time_on_campus(group):
    """Return the minutes between the first on-campus start and the last on-campus end.

    Parameters
    ----------
    group : pandas.DataFrame
        Rows of one group with a boolean 'on_campus' column and datetime
        'startTime' / 'endTime' columns.

    Returns
    -------
    float
        Span in minutes from the earliest 'startTime' to the latest 'endTime'
        among the on-campus rows, or 0 when no row is on campus.
    """
    on_campus_rows = group.loc[group['on_campus']]
    if on_campus_rows.empty:
        return 0.0
    #min/max instead of first/last row: the result no longer depends on the rows being
    #sorted by time (unsorted groups used to give wrong, even negative, spans)
    span = on_campus_rows['endTime'].max() - on_campus_rows['startTime'].min()
    return span.total_seconds() / 60

#minutes on campus per student, month and day, returned as a flat table
result = (
    grouped
    .apply(calculate_total_time_on_campus)
    .rename('total_time_on_campus')
    .reset_index()
)

In [None]:
#wrap the per-day result in its own DataFrame
time_campus = pd.DataFrame(data=result)

In [None]:
#total time in minutes spent on campus every month, per student
sum_time_campus = (
    time_campus
    .groupby(['StudentID', 'month'], as_index=False)['total_time_on_campus']
    .sum()
    .rename(columns={'total_time_on_campus': 'time_on_campus'})
)

In [None]:
#save total time on campus (one row per student and month) as CSV
#NOTE(review): absolute, user-specific Windows path - this cell only runs unchanged on that machine;
#consider a path relative to a configurable data directory
sum_time_campus.to_csv(r'C:\Users\maria\Desktop\Final data\new_datasets\university_time.csv')

3. Number of university visits

In [None]:
# One group per student, month and day of month
grouped = locations.groupby(['StudentID', 'month', 'date'])

# A day counts as visited when at least one of its records is on campus;
# summing those flags per student and month gives the number of visited days
result = (
    grouped['on_campus']
    .any()
    .groupby(level=['StudentID', 'month'])
    .sum()
    .reset_index(name='days_visited_on_campus')
)

# Display the resulting dataframe
print(result)

In [None]:
# plot the months with most visits
# the frame is passed by keyword: on seaborn versions whose first positional parameter is 'x',
# a positional frame collides with x='month' and raises TypeError
sns.barplot(data=result, x='month', y='days_visited_on_campus')

In [None]:
import matplotlib.pyplot as plt

# Total number of visited days per month, summed over all students
monthly_visits = result.groupby('month')['days_visited_on_campus'].sum()

# Months ordered from most to fewest visits
sorted_months = monthly_visits.sort_values(ascending=False)

# Bar chart on an explicitly created axes
fig, ax = plt.subplots(figsize=(8, 6))
sorted_months.plot(kind='bar', color='blue', ax=ax)
ax.set_xlabel('Month')
ax.set_ylabel('Number of Visits')
ax.set_title('Number of Visits per Month')
plt.xticks(rotation=45)
plt.tight_layout()

# Show the plot
plt.show()

In [None]:
#save uni visits
#the path is built with os.path.join: with a literal backslash, non-Windows systems would write a
#file whose NAME contains the backslash instead of writing into the 'new_datasets' folder
result.to_csv(os.path.join('new_datasets', 'university_visits.csv'))