In [126]:
"""
@author: maximefontana
"""
# System
import sys

# Graph Data
import snap
import networkx as nx

# Scientific Computing 
import numpy as np
import scipy
import pandas as pd
import csv

# Randomness and timing
import random as ran
from time import time

# Custom imports
%run Algorithms.ipynb
%run ClusteringEvaluation.ipynb
%run PlotFunctions.ipynb
%run GroundTruthGen.ipynb
%run SBM.ipynb

# Specific models
from sklearn.cluster import SpectralClustering, KMeans
from sklearn import metrics

# Plotting
#import tensorflow as tf
from matplotlib import pyplot as plt

In [127]:
# Load the 2013 high-school dataset into a DataFrame using pandas' default
# CSV parsing (comma-separated, first row taken as the header).
# NOTE(review): the file's actual layout is not visible here -- confirm that
# it really is comma-separated with a header row.
df = pd.read_csv('datasets/High-School_data_2013.csv')

In [128]:
# Read the tab-separated student metadata file; every row of the file
# becomes one list of strings inside students_list.
students_list = []
with open('datasets/metadata_2013.txt', newline='') as students:
    student_reader = csv.reader(students, delimiter='\t')
    # The reader is an iterator over rows, so it can be consumed in one call.
    students_list.extend(student_reader)

In [129]:
# Round-trip the metadata through a CSV file. to_csv writes the integer row
# index and the integer column labels, so the frame read back carries an extra
# 'Unnamed: 0' column (the original row position) and string column labels
# '0', '1', '2'. Later cells address the columns by exactly those names.
students_csv = "datasets/students2013.csv"
pd.DataFrame(students_list).to_csv(students_csv)
data = pd.read_csv(students_csv)
data.shape

(329, 4)

In [130]:
# Discard the students whose gender (column '2') is recorded as 'Unknown'
is_unknown = (data['2'] == 'Unknown').to_numpy()
data = data.drop(index=data.index[is_unknown])
data.shape  # Check the new shape: 329 -> 322 rows, i.e. 7 rows removed

(322, 4)

In [131]:
# Preview the first rows of the frame after the 'Unknown' rows were dropped
data.head()

Unnamed: 0.1,Unnamed: 0,0,1,2
0,0,650,2BIO1,F
1,1,498,2BIO1,F
2,2,627,2BIO1,F
3,3,857,2BIO1,F
4,4,487,2BIO1,F


In [132]:
# Add a numeric gender label: 1 for Female ('F'), 0 for Male ('M').
gender_codes = {'F': 1, 'M': 0}
# Fail loudly on any label outside the mapping (including missing values)
# instead of letting it be silently encoded as 0, i.e. as Male.
unexpected_genders = data.loc[~data['2'].isin(list(gender_codes)), '2']
assert unexpected_genders.empty, (
    f"Unexpected gender labels: {unexpected_genders.unique().tolist()}"
)
data['num_gender'] = data['2'].map(gender_codes)

In [133]:
# Display the frame, which now includes the num_gender column
data

Unnamed: 0.1,Unnamed: 0,0,1,2,num_gender
0,0,650,2BIO1,F,1
1,1,498,2BIO1,F,1
2,2,627,2BIO1,F,1
3,3,857,2BIO1,F,1
4,4,487,2BIO1,F,1
...,...,...,...,...,...
317,317,642,MP,M,0
318,318,245,MP,F,1
319,319,525,MP,F,1
327,327,205,2BIO3,M,0


In [134]:
# Show the rows ordered by the numeric gender label (0 first, then 1).
# The sorted frame is only displayed; `data` itself keeps its row order.
data.sort_values(by='num_gender', ascending=True)

Unnamed: 0.1,Unnamed: 0,0,1,2,num_gender
160,160,1238,MP*2,M,0
132,132,655,MP*1,M,0
133,133,149,MP*1,M,0
134,134,279,MP*1,M,0
135,135,290,MP*1,M,0
...,...,...,...,...,...
100,100,101,2BIO3,F,1
103,103,179,2BIO3,F,1
104,104,122,2BIO3,F,1
75,75,106,2BIO3,F,1


In [135]:
# Order the rows by the numeric gender label, then show only the
# 'Unnamed: 0' column, which holds the row position written out by the
# CSV round trip.
rows_by_gender = data.sort_values(by=['num_gender'])
rows_by_gender['Unnamed: 0']

160    160
132    132
133    133
134    134
135    135
      ... 
100    100
103    103
104    104
75      75
328    328
Name: Unnamed: 0, Length: 322, dtype: int64