In [1]:
#!pip install --upgrade google-auth google-auth-oauthlib google-auth-httplib2 google-api-python-client

# Capstone - Push data Chunk
### 2nd Survey **(Student Service Awareness)**

## Importing & Authenticating

In [2]:
# Authenticate the current Google Colab user. The BigQuery client created
# later in this notebook is constructed without explicit credentials.
import google.colab
google.colab.auth.authenticate_user()

In [3]:
from google.colab import files

# Upload the survey CSV from the local machine. The result is indexed by file
# name ('Student_Survey_2.csv') in the next cell to get the file's contents.
uploaded = files.upload()

Saving Student_Survey_2.csv to Student_Survey_2.csv


## Creating a DataFrame

In [4]:
# Build the survey DataFrame (df2) from the uploaded CSV's in-memory bytes.
import io
import pandas as pd

survey_csv_bytes = uploaded['Student_Survey_2.csv']
df2 = pd.read_csv(io.BytesIO(survey_csv_bytes))

In [5]:
# Show the raw column labels (the full survey question texts) before renaming.
df2.columns

Index(['Timestamp', 'Username',
       'In which cohort (Intake) did you start the DAB program?',
       'Which semester are you currently enrolled in?',
       'Are the events and services of the college effectively communicated?',
       'Which of the following services provided by the SRC/College are you interested in exploring or hearing more about?',
       'According to you, are there sufficient transportation service options available between Downtown and South Campus during events?',
       'In your opinion, would it be beneficial for the students to have additional events in Downtown, such as job fairs, cultural events, games, and career services?',
       'Do you have any suggestions or feedback regarding the services and events offered by the college?',
       'Is the duration of breaks between lectures appropriate?',
       'How would you rate the study areas currently available in or near the Downtown campus, on a scale of 1 to 5?',
       'Would you like to be in the same

In [6]:
# Map each long survey-question header to a short column name.
# Columns not listed here (e.g. Timestamp, Username) keep their labels.
column_aliases = {
    'In which cohort (Intake) did you start the DAB program?': 'Intake',
    'Which semester are you currently enrolled in?': 'Semester',
    'Are the events and services of the college effectively communicated?': 'Services_comm',
    'Which of the following services provided by the SRC/College are you interested in exploring or hearing more about?': 'Services_list',
    'According to you, are there sufficient transportation service options available between Downtown and South Campus during events?': 'Commute',
    'In your opinion, would it be beneficial for the students to have additional events in Downtown, such as job fairs, cultural events, games, and career services?': 'Additional_events',
    'Do you have any suggestions or feedback regarding the services and events offered by the college?': 'Events_suggestions',
    'Is the duration of breaks between lectures appropriate?': 'Breaks_btwn_Lecture',
    'How would you rate the study areas currently available in or near the Downtown campus, on a scale of 1 to 5?': 'Study_area_rating',
    'Would you like to be in the same section throughout the duration of the program?': 'Same_section',
}
df2 = df2.rename(columns=column_aliases)



## Data Cleaning

In [7]:
# Summarise the frame (columns, non-null counts, dtypes) to spot sparse or
# stray columns before cleaning.
df2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 126 entries, 0 to 125
Data columns (total 15 columns):
 #   Column               Non-Null Count  Dtype 
---  ------               --------------  ----- 
 0   Timestamp            126 non-null    object
 1   Username             1 non-null      object
 2   Intake               126 non-null    object
 3   Semester             126 non-null    object
 4   Services_comm        126 non-null    object
 5   Services_list        126 non-null    object
 6   Commute              126 non-null    object
 7   Additional_events    126 non-null    object
 8   Events_suggestions   36 non-null     object
 9   Breaks_btwn_Lecture  126 non-null    object
 10  Study_area_rating    126 non-null    object
 11  Same_section         126 non-null    object
 12  Unnamed: 12          1 non-null      object
 13  Unnamed: 13          1 non-null      object
 14  Unnamed: 14          1 non-null      object
dtypes: object(15)
memory usage: 14.9+ KB


In [8]:
# Drop the columns that are not needed for the analysis: the submission
# timestamp, the username, and the three unnamed trailing columns.
# (No dtype conversion happens here; every remaining column stays as loaded.)
unwanted_columns = ['Timestamp', 'Username', 'Unnamed: 12', 'Unnamed: 13', 'Unnamed: 14']
df2 = df2.drop(columns=unwanted_columns)


In [9]:
# Count the missing values in each remaining column.
df2.isnull().sum()

Intake                  0
Semester                0
Services_comm           0
Services_list           0
Commute                 0
Additional_events       0
Events_suggestions     90
Breaks_btwn_Lecture     0
Study_area_rating       0
Same_section            0
dtype: int64

In [10]:
# Replace missing free-text suggestions with the literal string 'None';
# the per-column mapping leaves every other column untouched.
df2 = df2.fillna({'Events_suggestions': 'None'})

In [11]:
# Display the cleaned frame (before the Services_list column is expanded).
df2

Unnamed: 0,Intake,Semester,Services_comm,Services_list,Commute,Additional_events,Events_suggestions,Breaks_btwn_Lecture,Study_area_rating,Same_section
0,Winter 2022,4th Semester,No,Commute/Transportation,No,Yes,,No,Yes,No
1,Winter 2022,4th Semester,No,Career Services;Food Bank;SRC Service Office,No,Yes,,No,2,No
2,Winter 2023,1st Semester,No,Housing;Career Services;Food Bank;Commute/Tran...,No,Yes,,Yes,3,Yes
3,Winter 2022,4th Semester,No,Housing;Career Services;Commute/Transportation...,Yes,Yes,Food Services should be more flexible,Yes,4,No
4,Fall 2022,2nd Semester,Yes,Career Services;Food Bank;Tax Assistance;Safe ...,Yes,Yes,Plz organise most of the events in downtown,Yes,2,Yes
...,...,...,...,...,...,...,...,...,...,...
121,Winter 2023,1st Semester,No,Housing;Career Services;Printing Services,Yes,No,,Yes,3,Yes
122,Fall 2022,2nd Semester,Yes,Printing Services;Parking Services;Health and ...,Yes,Yes,,Yes,3,Yes
123,Spring 2022,3rd Semester,No,Housing;Food Bank;Tax Assistance;Safe walk,Yes,Yes,,Yes,4,No
124,Winter 2022,4th Semester,Yes,Printing Services;Parking Services;Health and ...,Yes,No,,Yes,2,Yes


In [12]:
# Expand the ';'-separated Services_list answers into one 0/1 indicator column
# per service, append them to the frame, and remove the original text column.
# 'Commute/Transportation' is renamed so the label no longer contains a slash.
one_hot_encoded = df2['Services_list'].str.get_dummies(sep=';')
df2 = (
    pd.concat([df2, one_hot_encoded], axis=1)
    .drop(columns='Services_list')
    .rename(columns={'Commute/Transportation': 'Commute_Transportation'})
)

In [13]:
# Display the frame with the per-service indicator columns appended.
df2

Unnamed: 0,Intake,Semester,Services_comm,Commute,Additional_events,Events_suggestions,Breaks_btwn_Lecture,Study_area_rating,Same_section,Career Services,Commute_Transportation,Food Bank,Health and Nursing,Housing,Parking Services,Printing Services,SRC Service Office,Safe walk,Tax Assistance
0,Winter 2022,4th Semester,No,No,Yes,,No,Yes,No,0,1,0,0,0,0,0,0,0,0
1,Winter 2022,4th Semester,No,No,Yes,,No,2,No,1,0,1,0,0,0,0,1,0,0
2,Winter 2023,1st Semester,No,No,Yes,,Yes,3,Yes,1,1,1,0,1,0,0,0,0,0
3,Winter 2022,4th Semester,No,Yes,Yes,Food Services should be more flexible,Yes,4,No,1,1,0,0,1,0,1,0,0,1
4,Fall 2022,2nd Semester,Yes,Yes,Yes,Plz organise most of the events in downtown,Yes,2,Yes,1,0,1,1,0,0,1,1,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
121,Winter 2023,1st Semester,No,Yes,No,,Yes,3,Yes,1,0,0,0,1,0,1,0,0,0
122,Fall 2022,2nd Semester,Yes,Yes,Yes,,Yes,3,Yes,0,0,0,1,0,1,1,0,0,0
123,Spring 2022,3rd Semester,No,Yes,Yes,,Yes,4,No,0,0,1,0,1,0,0,0,1,1
124,Winter 2022,4th Semester,Yes,Yes,No,,Yes,2,Yes,0,0,0,1,0,1,1,0,0,0


In [14]:
# Replace every space in the column labels with an underscore
# (e.g. 'Career Services' -> 'Career_Services'), then display the result.
new_cols = [label.replace(' ', '_') for label in df2.columns]
df2 = df2.rename(columns=dict(zip(df2.columns, new_cols)))
df2

Unnamed: 0,Intake,Semester,Services_comm,Commute,Additional_events,Events_suggestions,Breaks_btwn_Lecture,Study_area_rating,Same_section,Career_Services,Commute_Transportation,Food_Bank,Health_and_Nursing,Housing,Parking_Services,Printing_Services,SRC_Service_Office,Safe_walk,Tax_Assistance
0,Winter 2022,4th Semester,No,No,Yes,,No,Yes,No,0,1,0,0,0,0,0,0,0,0
1,Winter 2022,4th Semester,No,No,Yes,,No,2,No,1,0,1,0,0,0,0,1,0,0
2,Winter 2023,1st Semester,No,No,Yes,,Yes,3,Yes,1,1,1,0,1,0,0,0,0,0
3,Winter 2022,4th Semester,No,Yes,Yes,Food Services should be more flexible,Yes,4,No,1,1,0,0,1,0,1,0,0,1
4,Fall 2022,2nd Semester,Yes,Yes,Yes,Plz organise most of the events in downtown,Yes,2,Yes,1,0,1,1,0,0,1,1,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
121,Winter 2023,1st Semester,No,Yes,No,,Yes,3,Yes,1,0,0,0,1,0,1,0,0,0
122,Fall 2022,2nd Semester,Yes,Yes,Yes,,Yes,3,Yes,0,0,0,1,0,1,1,0,0,0
123,Spring 2022,3rd Semester,No,Yes,Yes,,Yes,4,No,0,0,1,0,1,0,0,0,1,1
124,Winter 2022,4th Semester,Yes,Yes,No,,Yes,2,Yes,0,0,0,1,0,1,1,0,0,0


## Big-Query Table
### API Connection to Google BigQuery Database (surveyproject-378222.Capstone_Project)



In [15]:
# Push the cleaned survey DataFrame (df2) to BigQuery as Survey2_Base_Table.
# Imported explicitly instead of being reached through an incidental `google` binding.
from google.api_core.exceptions import NotFound
from google.cloud import bigquery

project_id = 'surveyproject-378222'
table_id = 'surveyproject-378222.Capstone_Project.Survey2_Base_Table'

# One client is enough for the DROP, the table creation and the load job.
client = bigquery.Client(project=project_id)

# Start from a clean slate. .result() blocks until the DROP has finished;
# without it the existence check below can race the still-running job.
sc = """
DROP TABLE IF EXISTS `surveyproject-378222.Capstone_Project.Survey2_Base_Table`;
"""
query_job = client.query(sc)
query_job.result()

# Schema for the new table. It mirrors df2 as it is at this point:
#   * Services_list is absent (it was expanded into the indicator columns),
#   * column names use underscores, matching the renamed DataFrame labels,
#   * Breaks_btwn_Lecture holds 'Yes'/'No' text, so it is a STRING,
#   * Study_area_rating stays STRING because the raw answers are not all numeric,
#   * the per-service indicator columns hold 0/1 and are INTEGER.
text_columns = [
    'Services_comm',
    'Commute',
    'Additional_events',
    'Events_suggestions',
    'Breaks_btwn_Lecture',
    'Study_area_rating',
    'Same_section',
]
service_columns = [
    'Career_Services',
    'Commute_Transportation',
    'Food_Bank',
    'Health_and_Nursing',
    'Housing',
    'Parking_Services',
    'Printing_Services',
    'SRC_Service_Office',
    'Safe_walk',
    'Tax_Assistance',
]
schema = (
    [
        bigquery.SchemaField('Intake', 'STRING', mode='REQUIRED'),
        bigquery.SchemaField('Semester', 'STRING', mode='REQUIRED'),
    ]
    + [bigquery.SchemaField(name, 'STRING') for name in text_columns]
    + [bigquery.SchemaField(name, 'INTEGER') for name in service_columns]
)

try:
    # Try to get the existing table
    table = client.get_table(table_id)
    print(f'Table {table.table_id} already exists in BigQuery.')
except NotFound:
    # If the table does not exist, create it
    table = bigquery.Table(table_id, schema=schema)
    table = client.create_table(table)
    print(f'Table {table.table_id} created in BigQuery.')

# Load the DataFrame, replacing any existing rows. The schema is passed to the
# load job as well so the declared column types are the ones the job uses.
job_config = bigquery.LoadJobConfig(
    source_format=bigquery.SourceFormat.CSV,
    write_disposition=bigquery.WriteDisposition.WRITE_TRUNCATE,
    schema=schema,
)
job = client.load_table_from_dataframe(df2, table_id, job_config=job_config)
job.result()



Table Survey2_Base_Table created in BigQuery.


LoadJob<project=surveyproject-378222, location=northamerica-northeast2, id=7216a086-b897-44ed-a784-144df85f5b74>

In [16]:
# Read the loaded table back from BigQuery and summarise its numeric columns
# to check the pushed data.
import pandas_gbq

Survey2_Base_Table = pandas_gbq.read_gbq(
    'SELECT * FROM `surveyproject-378222.Capstone_Project.Survey2_Base_Table`',
    project_id='surveyproject-378222',
    dialect='standard',
)
Survey2_Base_Table.describe()

Downloading: 100%|[32m██████████[0m|


Unnamed: 0,Career_Services,Commute_Transportation,Food_Bank,Health_and_Nursing,Housing,Parking_Services,Printing_Services,SRC_Service_Office,Safe_walk,Tax_Assistance
count,126.0,126.0,126.0,126.0,126.0,126.0,126.0,126.0,126.0,126.0
mean,0.603175,0.246032,0.460317,0.325397,0.325397,0.238095,0.285714,0.301587,0.15873,0.436508
std,0.491192,0.432417,0.500413,0.470393,0.470393,0.427618,0.453557,0.460779,0.366883,0.497932
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,1.0,0.0,1.0,1.0,1.0,0.0,1.0,1.0,0.0,1.0
max,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [17]:
# Check the column names, non-null counts and dtypes of the table as read back
# from BigQuery.
Survey2_Base_Table.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 126 entries, 0 to 125
Data columns (total 19 columns):
 #   Column                  Non-Null Count  Dtype 
---  ------                  --------------  ----- 
 0   Intake                  126 non-null    object
 1   Semester                126 non-null    object
 2   Services_comm           126 non-null    object
 3   Commute                 126 non-null    object
 4   Additional_events       126 non-null    object
 5   Events_suggestions      126 non-null    object
 6   Breaks_btwn_Lecture     126 non-null    object
 7   Study_area_rating       126 non-null    object
 8   Same_section            126 non-null    object
 9   Career_Services         126 non-null    Int64 
 10  Commute_Transportation  126 non-null    Int64 
 11  Food_Bank               126 non-null    Int64 
 12  Health_and_Nursing      126 non-null    Int64 
 13  Housing                 126 non-null    Int64 
 14  Parking_Services        126 non-null    Int64 
 15  Printi

## Breaking Survey2_Base_Table into 2 other Tables

In [18]:
 #Defining the SQL code to create tables and insert data
# Multi-statement script that splits Survey2_Base_Table into two tables:
#   * Survey_2          - the nine survey-answer columns (all STRING)
#   * Survey2_services  - the ten per-service indicator columns (all INTEGER)
# Each target is dropped and recreated so the script can be re-run.
# Every table reference is fully qualified with the project id, so the DROP,
# CREATE and INSERT statements always address the same tables regardless of
# the client's default project.
sql_code = """
DROP TABLE IF EXISTS `surveyproject-378222.Capstone_Project.Survey_2`;
CREATE TABLE `surveyproject-378222.Capstone_Project.Survey_2` (
  intake STRING,
  semester STRING,
  Services_comm STRING,
  Commute STRING,
  Additional_events STRING,
  Events_suggestions STRING,
  Breaks_btwn_Lecture STRING,
  Study_area_rating STRING,
  Same_section STRING
);

INSERT INTO `surveyproject-378222.Capstone_Project.Survey_2`
SELECT intake, semester, Services_comm, Commute, Additional_events,
       Events_suggestions, Breaks_btwn_Lecture, Study_area_rating, Same_section
FROM `surveyproject-378222.Capstone_Project.Survey2_Base_Table`;

DROP TABLE IF EXISTS `surveyproject-378222.Capstone_Project.Survey2_services`;
CREATE TABLE `surveyproject-378222.Capstone_Project.Survey2_services` (
  Career_Services INTEGER,
  Commute_Transportation INTEGER,
  Food_Bank INTEGER,
  Health_and_Nursing INTEGER,
  Housing INTEGER,
  Parking_Services INTEGER,
  Printing_Services INTEGER,
  SRC_Service_Office INTEGER,
  Safe_walk INTEGER,
  Tax_Assistance INTEGER
);

INSERT INTO `surveyproject-378222.Capstone_Project.Survey2_services`
SELECT Career_Services, Commute_Transportation, Food_Bank,
       Health_and_Nursing, Housing, Parking_Services,
       Printing_Services, SRC_Service_Office, Safe_walk,
       Tax_Assistance
FROM `surveyproject-378222.Capstone_Project.Survey2_Base_Table`;
"""
# Submit the script and block until every statement has finished.
query_job = client.query(sql_code)
query_job.result()


<google.cloud.bigquery.table._EmptyRowIterator at 0x7f16fe771ca0>

In [19]:
# Read the Survey_2 table back from BigQuery and display it.
Survey_2 = pandas_gbq.read_gbq(
    'SELECT * FROM `surveyproject-378222.Capstone_Project.Survey_2`',
    project_id='surveyproject-378222',
    dialect='standard',
)
Survey_2

Downloading: 100%|[32m██████████[0m|


Unnamed: 0,intake,semester,Services_comm,Commute,Additional_events,Events_suggestions,Breaks_btwn_Lecture,Study_area_rating,Same_section
0,Fall 2021,4th Semester,No,No,Yes,COLLEGE SHOULD PROVIDE SOME MORE TIME IN THE ...,Yes,1,Yes
1,Fall 2021,4th Semester,Yes,No,Yes,It would be great to have job fairs in downtow...,No,4,Yes
2,Fall 2021,4th Semester,No,No,Yes,There is a desperate need of career focused ev...,Yes,1,No
3,Fall 2021,4th Semester,No,No,Yes,Most of the services like printing are unneces...,No,3,Yes
4,Fall 2021,4th Semester,Yes,No,No,,No,1,Yes
...,...,...,...,...,...,...,...,...,...
121,Winter 2023,1st Semester,Yes,Yes,Yes,It would be great to have more career and netw...,Yes,4,Yes
122,Winter 2023,1st Semester,Yes,Yes,Yes,,No,4,Yes
123,Winter 2023,1st Semester,No,Yes,No,,No,1,Yes
124,Winter 2023,1st Semester,No,Yes,Yes,Need canteen,Yes,2,Yes


In [20]:
# Read the Survey2_services table back from BigQuery and display it.
Survey2_services = pandas_gbq.read_gbq(
    'SELECT * FROM `surveyproject-378222.Capstone_Project.Survey2_services`',
    project_id='surveyproject-378222',
    dialect='standard',
)
Survey2_services

Downloading: 100%|[32m██████████[0m|


Unnamed: 0,Career_Services,Commute_Transportation,Food_Bank,Health_and_Nursing,Housing,Parking_Services,Printing_Services,SRC_Service_Office,Safe_walk,Tax_Assistance
0,0,0,1,0,0,0,0,0,0,1
1,0,1,0,0,0,0,0,0,0,0
2,0,1,0,0,1,0,0,0,1,1
3,0,0,0,0,0,0,0,1,0,0
4,0,0,0,0,0,1,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...
121,1,0,1,1,0,0,0,0,0,1
122,1,0,0,1,0,1,0,1,0,0
123,1,0,0,1,0,0,0,0,0,1
124,1,0,0,1,0,0,1,0,0,0
