In [9]:
#!pip install --upgrade google-auth google-auth-oauthlib google-auth-httplib2 google-api-python-client

# Capstone - Push data Chunk
### 2nd Survey **(Student Service Awareness)**

## Importing & Authenticating

In [10]:
# Authenticate the current Colab user for this session.
# `import google.colab` is kept so the top-level `google` name stays bound for later cells.
import google.colab
from google.colab import auth as colab_auth

colab_auth.authenticate_user()

In [11]:
from google.colab import files

# Upload the survey CSV from the local machine.
# `uploaded` is looked up by file name in the following cell to obtain the raw file bytes.
uploaded = files.upload()

Saving Student_Survey_2.csv to Student_Survey_2 (1).csv


## Creating a DataFrame

In [12]:
# Build the survey DataFrame (df2) from the bytes of the uploaded CSV
import io
import pandas as pd

survey_csv_bytes = uploaded['Student_Survey_2.csv']
df2 = pd.read_csv(io.BytesIO(survey_csv_bytes))

In [13]:
# Display the raw column labels (the full survey question texts) as read from the CSV
df2.columns

Index(['Timestamp', 'Username',
       'In which cohort (Intake) did you start the DAB program?',
       'Which semester are you currently enrolled in?',
       'Are the events and services of the college effectively communicated?',
       'Which of the following services provided by the SRC/College are you interested in exploring or hearing more about?',
       'According to you, are there sufficient transportation service options available between Downtown and South Campus during events?',
       'In your opinion, would it be beneficial for the students to have additional events in Downtown, such as job fairs, cultural events, games, and career services?',
       'Do you have any suggestions or feedback regarding the services and events offered by the college?',
       'Is the duration of breaks between lectures appropriate?',
       'How would you rate the study areas currently available in or near the Downtown campus, on a scale of 1 to 5?',
       'Would you like to be in the same

In [14]:
# Replace the verbose survey-question headers with short, consistent column names.
# Keys are the exact question texts from the CSV; values are the names used downstream.
short_column_names = {
    'In which cohort (Intake) did you start the DAB program?': 'Intake',
    'Which semester are you currently enrolled in?': 'Semester',
    'Are the events and services of the college effectively communicated?': 'Services_comm',
    'Which of the following services provided by the SRC/College are you interested in exploring or hearing more about?': 'Services_list',
    'According to you, are there sufficient transportation service options available between Downtown and South Campus during events?': 'Commute',
    'In your opinion, would it be beneficial for the students to have additional events in Downtown, such as job fairs, cultural events, games, and career services?': 'Additional_events',
    'Do you have any suggestions or feedback regarding the services and events offered by the college?': 'Events_suggestions',
    'Is the duration of breaks between lectures appropriate?': 'Breaks_btwn_Lecture',
    'How would you rate the study areas currently available in or near the Downtown campus, on a scale of 1 to 5?': 'Study_area_rating',
    'Would you like to be in the same section throughout the duration of the program?': 'Same_section',
}
df2 = df2.rename(columns=short_column_names)



## Data Cleaning

In [15]:
# Summarise column names, non-null counts and dtypes to decide what needs cleaning
df2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 125 entries, 0 to 124
Data columns (total 15 columns):
 #   Column               Non-Null Count  Dtype 
---  ------               --------------  ----- 
 0   Timestamp            125 non-null    object
 1   Username             1 non-null      object
 2   Intake               125 non-null    object
 3   Semester             125 non-null    object
 4   Services_comm        125 non-null    object
 5   Services_list        125 non-null    object
 6   Commute              125 non-null    object
 7   Additional_events    125 non-null    object
 8   Events_suggestions   35 non-null     object
 9   Breaks_btwn_Lecture  125 non-null    object
 10  Study_area_rating    125 non-null    object
 11  Same_section         125 non-null    object
 12  Unnamed: 12          1 non-null      object
 13  Unnamed: 13          1 non-null      object
 14  Unnamed: 14          1 non-null      object
dtypes: object(15)
memory usage: 14.8+ KB


In [16]:
# Drop the columns that are not needed for the analysis: the submission
# timestamp, the (almost empty) username, and the stray 'Unnamed: N' columns
# that df2.info() shows with a single non-null value each.
unwanted_columns = ['Timestamp', 'Username', 'Unnamed: 12', 'Unnamed: 13', 'Unnamed: 14']

# One drop call instead of five avoids building four throw-away copies.
# errors='ignore' keeps this cell re-runnable: once the columns are gone,
# running it again is a no-op instead of a KeyError.
df2 = df2.drop(columns=unwanted_columns, errors='ignore')


In [17]:
# Count the missing values in each remaining column
df2.isna().sum()

Intake                  0
Semester                0
Services_comm           0
Services_list           0
Commute                 0
Additional_events       0
Events_suggestions     90
Breaks_btwn_Lecture     0
Study_area_rating       0
Same_section            0
dtype: int64

In [23]:
# Respondents who left the free-text suggestion blank get the placeholder text 'None'
suggestions = df2['Events_suggestions']
df2['Events_suggestions'] = suggestions.fillna('None')

## BigQuery Table
### API Connection to Google BigQuery Database (surveyproject-378222.Capstone_Project)



In [26]:
# Connect to the BigQuery project, make sure the destination table exists,
# then replace its contents with the cleaned survey DataFrame (df2).

# Import NotFound explicitly: the except clause must not depend on the
# top-level `google` name having been bound by an import in another cell.
from google.api_core.exceptions import NotFound
from google.cloud import bigquery

project_id = 'surveyproject-378222'
table_id = 'surveyproject-378222.Capstone_Project.Surevy2_Base_Table'

# A single client serves the lookup, the optional create and the load job.
client = bigquery.Client(project=project_id)

# NOTE: this statement is only defined here; nothing in this cell submits it
# to BigQuery. The load below uses WRITE_TRUNCATE instead.
sql_code = """
DROP TABLE IF EXISTS `surveyproject-378222.Capstone_Project.Surevy2_Base_Table`;
"""

# Schema used only when the table has to be created (see the except branch).
# NOTE(review): this schema declares Breaks_btwn_Lecture as BOOLEAN and lists
# ten service columns ('Career Services' ... 'Tax Assistance') that df2 does
# not contain, and it is not passed to the load job below - confirm whether
# it should be aligned with df2 or attached to job_config.
schema = [
    bigquery.SchemaField('Intake', 'STRING', mode='REQUIRED'),
    bigquery.SchemaField('Semester', 'STRING', mode='REQUIRED'),
    bigquery.SchemaField('Services_comm', 'STRING'),
    bigquery.SchemaField('Services_list', 'STRING'),
    bigquery.SchemaField('Commute', 'STRING'),
    bigquery.SchemaField('Additional_events', 'STRING'),
    bigquery.SchemaField('Events_suggestions', 'STRING'),
    bigquery.SchemaField('Breaks_btwn_Lecture', 'BOOLEAN'),
    bigquery.SchemaField('Study_area_rating', 'STRING'),
    bigquery.SchemaField('Same_section', 'STRING'),
    bigquery.SchemaField('Career Services', 'STRING'),
    bigquery.SchemaField('Commute/Transportation', 'STRING'),
    bigquery.SchemaField('Food Bank', 'STRING'),
    bigquery.SchemaField('Health and Nursing', 'STRING'),
    bigquery.SchemaField('Housing', 'STRING'),
    bigquery.SchemaField('Parking Services', 'STRING'),
    bigquery.SchemaField('Printing Services', 'STRING'),
    bigquery.SchemaField('SRC Service Office', 'STRING'),
    bigquery.SchemaField('Safe walk', 'STRING'),
    bigquery.SchemaField('Tax Assistance', 'STRING')

]

try:
    # Try to get the existing table
    table = client.get_table(table_id)
    print(f'Table {table.table_id} already exists in BigQuery.')
except NotFound:
    # If the table does not exist, create it
    table = client.create_table(bigquery.Table(table_id, schema=schema))
    print(f'Table {table.table_id} created in BigQuery.')

# Load df2 into the table; WRITE_TRUNCATE overwrites whatever is there.
job_config = bigquery.LoadJobConfig(
    source_format=bigquery.SourceFormat.CSV,
    write_disposition=bigquery.WriteDisposition.WRITE_TRUNCATE,
)
job = client.load_table_from_dataframe(df2, table_id, job_config=job_config)

# Block until the load finishes; as the last expression it also displays the job.
job.result()



Table Surevy2_Base_Table already exists in BigQuery.


LoadJob<project=surveyproject-378222, location=northamerica-northeast2, id=6aac3e67-c0ef-4d20-b9a1-759f0c7b9275>

In [25]:
# Read the table back from BigQuery to check what was pushed
import pandas_gbq

verification_query = 'SELECT * FROM `surveyproject-378222.Capstone_Project.Surevy2_Base_Table`'
sb2 = pandas_gbq.read_gbq(
    verification_query,
    project_id='surveyproject-378222',
    dialect='standard',
)
sb2.describe()

Downloading: 100%|[32m██████████[0m|


Unnamed: 0,Intake,Semester,Services_comm,Services_list,Commute,Additional_events,Events_suggestions,Breaks_btwn_Lecture,Study_area_rating,Same_section
count,125,125,125,125,125,125,125.0,125,125,125
unique,5,4,2,90,2,2,30.0,2,6,2
top,Winter 2023,1st Semester,Yes,Career Services,Yes,Yes,,Yes,4,Yes
freq,49,50,86,6,67,108,93.0,83,36,82
