In [1]:
import numpy as np
import pandas as pd
import os
from collections import defaultdict
import datetime
import pytz
import matplotlib.pyplot as plt


class project_data():
    '''
    the class generate the features might be used
    the root path of input and output is needed
    all function in the class return dictionary(s)
    '''
    def __init__(self, root_path):
        self.root_path = root_path
        self.pre_student = ['u00', 'u01', 'u02', 'u03', 'u04', 'u05', 'u07', 'u08', 'u09', 'u10', 'u12', 'u13', 'u14',
                            'u15', 'u16', 'u17', 'u18', 'u19', 'u20', 'u22', 'u23', 'u24', 'u27', 'u30', 'u31', 'u32',
                            'u33', 'u34', 'u35', 'u36', 'u39', 'u42', 'u43', 'u44', 'u45', 'u46', 'u47', 'u49', 'u50',
                            'u51', 'u52', 'u53', 'u56', 'u57', 'u58', 'u59']
        self.post_student = ['u00', 'u01', 'u02', 'u03', 'u04', 'u05', 'u07', 'u09', 'u10', 'u14', 'u15', 'u16', 'u17',
                             'u19', 'u20', 'u23', 'u24', 'u27', 'u30', 'u31', 'u32', 'u33', 'u34', 'u35', 'u36', 'u42',
                             'u43', 'u44', 'u45', 'u46', 'u47', 'u49', 'u51', 'u52', 'u53', 'u56', 'u59']
        self.tz = pytz.timezone('America/New_York')  # the time zone of Dartmouth College
        self.duration = 6_048_000  # the total seconds of 10 weeks

    def FlourishingScale(self):
        '''
        the flourishing scale, as well as the panas scale are miss some values.
        for the fairness and calculation, we add a median value if there is a nan since the missing value may suggest
        that it's hard to say it negative or positive. but it's unappropriate to add zero.
        :returns: dictionary key: student id, value: sum of all choice
        '''
        flour_path = os.path.join(self.root_path, 'Outputs/FlourishingScale.csv')
        flour_file = np.array(pd.read_csv(flour_path))
        pre_flour = defaultdict(int)
        post_flour = defaultdict(int)
        for r in flour_file:
            if r[1] == 'pre':
                for c in range(2, 10):
                    if np.isnan(float(r[c])):  # one of several ways to detect nan
                        pre_flour[r[0]] += 4  # the choice is 1-7 so the median value we use is 4
                    else:
                        pre_flour[r[0]] += float(r[c])
            elif r[1] == 'post':
                for c in range(2, 10):
                    if np.isnan(float(r[c])):
                        pre_flour[r[0]] += 4
                    else:
                        post_flour[r[0]] += float(r[c])
        return pre_flour, post_flour

    def PANAS(self):
        '''
        similar above
        :return: 4 dictionaries
        '''
        panas_path = os.path.join(self.root_path, 'Outputs/panas.csv')
        panas_file = np.array(pd.read_csv(panas_path))
        pre_pos_panas = defaultdict(int)
        pre_neg_panas = defaultdict(int)
        post_pos_panas = defaultdict(int)
        post_neg_panas = defaultdict(int)
        positive_choices = [2, 5, 9, 10, 12, 13, 15, 16, 18]  # the positive and negatives value are mixed and are different from the original one
        for r in panas_file:
            if r[1] == 'pre':
                for c in range(2, 20):
                    if c in positive_choices:
                        if np.isnan(float(r[c])):
                            pre_pos_panas[r[0]] += 3  # the choice of panas is 1-5 so the median value is 3
                        else:
                            pre_pos_panas[r[0]] += float(r[c])
                    else:
                        if np.isnan(float(r[c])):
                            pre_neg_panas[r[0]] += 3
                        else:
                            pre_neg_panas[r[0]] += float(r[c])
            elif r[1] == 'post':
                for c in range(2, 20):
                    if c in positive_choices:
                        if np.isnan(float(r[c])):
                            post_pos_panas[r[0]] += 3
                        else:
                            post_pos_panas[r[0]] += float(r[c])
                    else:
                        if np.isnan(float(r[c])):
                            post_neg_panas[r[0]] += 3
                        else:
                            post_neg_panas[r[0]] += float(r[c])
        return pre_pos_panas, pre_neg_panas, post_pos_panas, post_neg_panas

    def conversation_freq(self):
        '''
        as the reference required, the data is divided into three time period;
        day: 9am - 6pm
        evening: 6pm - 12pm
        night: 12pm - 9am
        in the data, some sensing data are appearntly short than the other, just keep it
        '''
        conv_freq_day = defaultdict(float)
        conv_freq_eve = defaultdict(float)
        conv_freq_nig = defaultdict(float)
        csv_dir = os.path.join(self.root_path, 'Inputs/sensing/conversation')
        for s in self.pre_student:
            file_path = os.path.join(csv_dir, 'conversation_' + s + '.csv')
            csv_file = np.array(pd.read_csv(file_path))
            for r in csv_file:
                time = datetime.datetime.fromtimestamp(r[0], tz=self.tz) # change to the time zone tz=
                if 9 <= time.hour < 18:
                    conv_freq_day[s] += 1
                elif time.hour >= 18:
                    conv_freq_eve[s] += 1
                else:
                    conv_freq_nig[s] += 1
        return conv_freq_day, conv_freq_eve, conv_freq_nig

    def conversation_dura(self):
        '''
        the each conversation belong to the time period of the start timestamp
        day: 9am - 6pm
        evening: 6pm - 12pm
        night: 12pm - 9am
        '''
        conv_dura_day = defaultdict(float)
        conv_dura_eve = defaultdict(float)
        conv_dura_nig = defaultdict(float)
        csv_dir = os.path.join(self.root_path, 'Inputs/sensing/conversation')
        for s in self.pre_student:
            file_path = os.path.join(csv_dir, 'conversation_' + s + '.csv')
            csv_file = np.array(pd.read_csv(file_path))
            for r in csv_file:
                start_time = datetime.datetime.fromtimestamp(r[0], tz=self.tz)  # change to the time zone tz=
                end_time = datetime.datetime.fromtimestamp(r[1], tz=self.tz)
                time = (end_time - start_time).seconds
                if 9 <= start_time.hour < 18:
                    conv_dura_day[s] += time
                elif start_time.hour >= 18:
                    conv_dura_eve[s] += time
                else:
                    conv_dura_nig[s] += time
        return conv_dura_day, conv_dura_eve, conv_dura_nig

    def co_location(self):
        '''
        the bluetooth is used to calculate co-location
        some data appeared less than 10 times should be removed
        '''
        csv_dir = os.path.join(self.root_path, 'Inputs/sensing/bluetooth')
        co_location = defaultdict(float)
        temp_list = []  # used to save the dictionaries
        for s in self.pre_student:
            file_path = os.path.join(csv_dir, 'bt_' + s + '.csv')
            csv_file = np.array(pd.read_csv(file_path))
            temp_dict = defaultdict(int)  # used to save how many times the bluetooth mac has meet
            for r in csv_file:
                temp_dict[r[1]] += 1
            temp_list.append(temp_dict)
        for s, dic in zip(self.pre_student, temp_list):
            file_path = os.path.join(csv_dir, 'bt_' + s + '.csv')
            csv_file = np.array(pd.read_csv(file_path))
            for r in csv_file:
                if dic[r[1]] > 10:
                    co_location[s] += 1
        return co_location

    def activity(self):
        '''
        the csv files are of different length, so the result will divide by the length of 0 value
        '''
        activity_day = defaultdict(float)
        activity_eve = defaultdict(float)
        activity_nig = defaultdict(float)
        csv_dir = os.path.join(self.root_path, 'Inputs/sensing/activity')
        for s in self.pre_student:
            file_path = os.path.join(csv_dir, 'activity_' + s + '.csv')
            csv_file = np.array(pd.read_csv(file_path))
            zeros = 0
            for r in csv_file:
                if r[1] == 0:
                    zeros += 1
                    continue
                time = datetime.datetime.fromtimestamp(r[0], tz=self.tz)
                if 9 <= time.hour < 18:
                    activity_day[s] += 1
                elif time.hour >= 18:
                    activity_eve[s] += 1
                else:
                    activity_nig[s] += 1
            activity_day[s] /= zeros
            activity_eve[s] /= zeros
            activity_nig[s] /= zeros
        return activity_day, activity_eve, activity_nig

    def traveled_distance(self):
        '''
        use gps value to calculate traveled distance
        the change of  latitude and longitude represent the distance
        block distance is used
        '''
        distance_day = defaultdict(int)
        distance_eve = defaultdict(int)
        distance_nig = defaultdict(int)
        csv_dir = os.path.join(self.root_path, 'Inputs/sensing/gps')
        for s in self.pre_student:
            file_path = os.path.join(csv_dir, 'gps_' + s + '.csv')
            csv_file = np.array(pd.read_csv(file_path, index_col=False))
            x, y = 0, 0  # last
            for r in csv_file:
                if x == 0 and y == 0:
                    x, y = r[4], r[5]
                    continue
                time = datetime.datetime.fromtimestamp(r[0], tz=self.tz)
                if 9 <= time.hour < 18:
                    distance_day[s] += self.block_distance(x, y, r[4], r[5])
                elif time.hour >= 18:
                    distance_eve[s] += self.block_distance(x, y, r[4], r[5])
                else:
                    distance_nig[s] += self.block_distance(x, y, r[4], r[5])
                x, y = r[4], r[5]  # record last gps
        return distance_day, distance_eve, distance_nig

    def block_distance(self, x1, y1, x2, y2):
        # block distace
        return abs(x1 - x2) + abs(y1 - y2)

    def indoor_mobility(self):
        '''
        wifi scan logs indicate indoor mobility
        '''
        indoor_mobility_day = defaultdict(int)
        indoor_mobility_eve = defaultdict(int)
        indoor_mobility_nig = defaultdict(int)
        wifi_dir = os.path.join(self.root_path, 'Inputs/sensing/wifi')
        for s in self.pre_student:
            wifi_path = os.path.join(wifi_dir, 'wifi_' + s + '.csv')
            wifi_file = np.array(pd.read_csv(wifi_path, index_col=False))
            last_timestamp = 0  # wifi may detect several wifi at same time
            for r in wifi_file:
                if r[0] == last_timestamp:
                    continue
                last_timestamp = r[0]
                time = datetime.datetime.fromtimestamp(r[0], tz=self.tz)
                if 9 <= time.hour < 18:
                    indoor_mobility_day[s] += 1
                elif time.hour >= 18:
                    indoor_mobility_eve[s] += 1
                else:
                    indoor_mobility_nig[s] += 1
        return indoor_mobility_day, indoor_mobility_eve, indoor_mobility_nig

    def sleep_duration(self):
        sleep = defaultdict(list)
        activity_dir = os.path.join(self.root_path, 'Inputs/sensing/activity')
        audio_dir = os.path.join(self.root_path, 'Inputs/sensing/audio')
        phonecharge_dir = os.path.join(self.root_path, 'Inputs/sensing/phonecharge')
        phonelock_dir = os.path.join(self.root_path, 'Inputs/sensing/phonelock')
        for s in self.pre_student:
            activity_path = os.path.join(activity_dir, 'activity_' + s + '.csv')
            activity_file = np.array(pd.read_csv(activity_path, index_col=False))
            audio_path = os.path.join(audio_dir, 'audio_' + s + '.csv')
            audio_file = np.array(pd.read_csv(audio_path, index_col=False))
            phonecharge_path = os.path.join(phonecharge_dir, 'phonecharge_' + s + '.csv')
            phonecharge_file = np.array(pd.read_csv(phonecharge_path, index_col=False))
            phonelock_path = os.path.join(phonelock_dir, 'phonelock_' + s + '.csv')
            phonelock_file = np.array(pd.read_csv(phonelock_path, index_col=False))
            start = min(activity_file[0][0], audio_file[0][0], phonelock_file[0][0], phonecharge_file[0][0])
            end = max(activity_file[-1][0], audio_file[-1][0], phonelock_file[-1][0], phonecharge_file[-1][0])
            temp = np.ones((end - start + 3, 4), dtype=np.float32)
            temp[:, 0] = temp[:, 0] * 0.5445
            temp[:, 1] = temp[:, 1] * 0.3484
            temp[:, 2] = temp[:, 2] * 0
            temp[:, 3] = temp[:, 3] * 0
            for r in activity_file:
                if r[1] != 0:
                    temp[r[0] - start][0] = 0
                    temp[r[0] - start + 1][0] = 0
                    temp[r[0] - start + 2][0] = 0
                else:
                    temp[r[0] - start][0] = 0.5445
            for r in audio_file:
                if r[1] != 0:
                    temp[r[0] - start][1] = 0
                    temp[r[0] - start + 1][1] = 0
                    temp[r[0] - start + 2][1] = 0
                else:
                    temp[r[0] - start][1] = 0.3484
            for r in phonelock_file:
                for rr in range(r[0], r[1]):
                    temp[rr - start][2] = 0.0512
            for r in phonecharge_file:
                for rr in range(r[0], r[1]):
                    temp[rr - start][3] = 0.0469
            time = 0
            for r in temp:
                if np.sum(r) > 0.9:
                    time += 1
            sleep[s] = time / (end - start)
            print(s)
        return sleep


In [2]:
root_path = 'C:\Users\kyrie\Desktop\comp9417\project\StudentLife_Dataset'
#root_path = os.path.join(os.path.dirname(__file__), 'StudentLife_Dataset')
data = project_data(root_path=root_path)


In [3]:
pre_studnet = data.pre_student
post_student = data.post_student


In [5]:
pre_flour, post_flour = data.FlourishingScale()
print(pre_flour)
print(post_flour)


defaultdict(<class 'int'>, {'u00': 47.0, 'u01': 45.0, 'u02': 46.0, 'u03': 34.0, 'u04': 27.0, 'u05': 48.0, 'u07': 49.0, 'u08': 37.0, 'u09': 46.0, 'u10': 39.0, 'u12': 49.0, 'u13': 44.0, 'u14': 52.0, 'u15': 43.0, 'u16': 42.0, 'u17': 37.0, 'u18': 37.0, 'u19': 42.0, 'u20': 45.0, 'u22': 46.0, 'u23': 35.0, 'u24': 41.0, 'u27': 31.0, 'u30': 52.0, 'u31': 16.0, 'u32': 54.0, 'u33': 31.0, 'u34': 49.0, 'u35': 48.0, 'u36': 46.0, 'u39': 15.0, 'u42': 45.0, 'u43': 47.0, 'u44': 48.0, 'u45': 48.0, 'u46': 42.0, 'u47': 47.0, 'u49': 51.0, 'u50': 48.0, 'u51': 39.0, 'u52': 34.0, 'u53': 50.0, 'u56': 46.0, 'u57': 50.0, 'u58': 51.0, 'u59': 43.0})
defaultdict(<class 'int'>, {'u00': 45.0, 'u01': 46.0, 'u02': 44.0, 'u03': 31.0, 'u04': 33.0, 'u05': 50.0, 'u07': 47.0, 'u09': 47.0, 'u10': 39.0, 'u14': 53.0, 'u15': 48.0, 'u16': 41.0, 'u17': 38.0, 'u19': 42.0, 'u20': 45.0, 'u23': 42.0, 'u24': 43.0, 'u27': 31.0, 'u30': 56.0, 'u31': 36.0, 'u32': 56.0, 'u33': 28.0, 'u34': 23.0, 'u35': 44.0, 'u36': 46.0, 'u42': 16.0, 'u43': 

In [7]:
pre_pos_panas, pre_neg_panas, post_pos_panas, post_neg_panas = data.PANAS()
print(pre_pos_panas)
print(pre_neg_panas)
print(post_pos_panas)
print(post_neg_panas)


defaultdict(<class 'int'>, {'u00': 32.0, 'u01': 30.0, 'u02': 23.0, 'u03': 30.0, 'u04': 27.0, 'u05': 32.0, 'u07': 30.0, 'u08': 30.0, 'u09': 28.0, 'u10': 33.0, 'u12': 42.0, 'u13': 13.0, 'u14': 40.0, 'u15': 30.0, 'u16': 18.0, 'u17': 31.0, 'u18': 16.0, 'u19': 31.0, 'u20': 29.0, 'u22': 26.0, 'u23': 29.0, 'u24': 26.0, 'u27': 23.0, 'u30': 34.0, 'u31': 31.0, 'u32': 29.0, 'u33': 19.0, 'u34': 28.0, 'u35': 27.0, 'u36': 32.0, 'u39': 20.0, 'u42': 24.0, 'u43': 31.0, 'u44': 32.0, 'u45': 27.0, 'u46': 34.0, 'u47': 34.0, 'u49': 35.0, 'u50': 25.0, 'u51': 24.0, 'u52': 40.0, 'u53': 28.0, 'u56': 22.0, 'u57': 35.0, 'u58': 28.0, 'u59': 34.0})
defaultdict(<class 'int'>, {'u00': 33.0, 'u01': 19.0, 'u02': 15.0, 'u03': 27.0, 'u04': 14.0, 'u05': 27.0, 'u07': 21.0, 'u08': 26.0, 'u09': 11.0, 'u10': 20.0, 'u12': 18.0, 'u13': 21.0, 'u14': 43.0, 'u15': 25.0, 'u16': 20.0, 'u17': 17.0, 'u18': 28.0, 'u19': 22.0, 'u20': 13.0, 'u22': 15.0, 'u23': 28.0, 'u24': 15.0, 'u27': 18.0, 'u30': 33.0, 'u31': 17.0, 'u32': 14.0, 'u33': 

In [None]:
pass


In [8]:
conv_freq_day, conv_freq_eve, conv_freq_nig = data.conversation_freq()
print(conv_freq_day)
print(conv_freq_eve)
print(conv_freq_nig)


defaultdict(<class 'float'>, {'u00': 1051.0, 'u01': 953.0, 'u02': 742.0, 'u03': 290.0, 'u04': 1432.0, 'u05': 920.0, 'u07': 531.0, 'u08': 1025.0, 'u09': 1307.0, 'u10': 1252.0, 'u12': 1028.0, 'u13': 931.0, 'u14': 875.0, 'u15': 534.0, 'u16': 759.0, 'u17': 863.0, 'u18': 471.0, 'u19': 897.0, 'u20': 496.0, 'u22': 472.0, 'u23': 730.0, 'u24': 445.0, 'u27': 812.0, 'u30': 1062.0, 'u31': 968.0, 'u32': 1153.0, 'u33': 460.0, 'u34': 584.0, 'u35': 814.0, 'u36': 994.0, 'u39': 194.0, 'u42': 583.0, 'u43': 928.0, 'u44': 945.0, 'u45': 576.0, 'u46': 870.0, 'u47': 408.0, 'u49': 849.0, 'u50': 406.0, 'u51': 983.0, 'u52': 322.0, 'u53': 778.0, 'u56': 800.0, 'u57': 1078.0, 'u58': 762.0, 'u59': 1679.0})
defaultdict(<class 'float'>, {'u00': 678.0, 'u01': 711.0, 'u02': 545.0, 'u03': 160.0, 'u04': 881.0, 'u05': 544.0, 'u07': 460.0, 'u08': 973.0, 'u09': 744.0, 'u10': 844.0, 'u12': 611.0, 'u13': 788.0, 'u14': 698.0, 'u15': 414.0, 'u16': 709.0, 'u17': 400.0, 'u18': 418.0, 'u19': 651.0, 'u20': 402.0, 'u22': 356.0, 'u23'

In [9]:
conv_dura_day, conv_dura_eve, conv_dura_nig = data.conversation_dura()
print(conv_dura_day)
print(conv_dura_eve)
print(conv_dura_nig)


defaultdict(<class 'float'>, {'u00': 976829.0, 'u01': 724456.0, 'u02': 696055.0, 'u03': 190186.0, 'u04': 813296.0, 'u05': 655971.0, 'u07': 444232.0, 'u08': 622104.0, 'u09': 934978.0, 'u10': 786533.0, 'u12': 819844.0, 'u13': 379189.0, 'u14': 749945.0, 'u15': 433509.0, 'u16': 485734.0, 'u17': 615728.0, 'u18': 406927.0, 'u19': 630558.0, 'u20': 240160.0, 'u22': 291268.0, 'u23': 361194.0, 'u24': 358664.0, 'u27': 440376.0, 'u30': 1015512.0, 'u31': 541007.0, 'u32': 886372.0, 'u33': 450027.0, 'u34': 361817.0, 'u35': 422991.0, 'u36': 417998.0, 'u39': 46149.0, 'u42': 400633.0, 'u43': 578474.0, 'u44': 468247.0, 'u45': 410655.0, 'u46': 534247.0, 'u47': 174913.0, 'u49': 474806.0, 'u50': 209903.0, 'u51': 492885.0, 'u52': 128738.0, 'u53': 521124.0, 'u56': 319866.0, 'u57': 671156.0, 'u58': 599928.0, 'u59': 683761.0})
defaultdict(<class 'float'>, {'u00': 401819.0, 'u01': 666806.0, 'u02': 452448.0, 'u03': 51797.0, 'u04': 495413.0, 'u05': 364188.0, 'u07': 259167.0, 'u08': 625278.0, 'u09': 671441.0, 'u10'

In [10]:
co_location = data.co_location()
print(co_location)


defaultdict(<class 'float'>, {'u00': 5584.0, 'u01': 5741.0, 'u02': 16563.0, 'u03': 2395.0, 'u04': 5148.0, 'u05': 57685.0, 'u07': 8254.0, 'u08': 15902.0, 'u09': 12040.0, 'u10': 13535.0, 'u12': 5458.0, 'u13': 171432.0, 'u14': 38941.0, 'u15': 4275.0, 'u16': 11171.0, 'u17': 8786.0, 'u18': 26094.0, 'u19': 18106.0, 'u20': 7165.0, 'u22': 3585.0, 'u23': 8412.0, 'u24': 6510.0, 'u27': 10001.0, 'u30': 67403.0, 'u31': 10916.0, 'u32': 41084.0, 'u33': 4799.0, 'u34': 4203.0, 'u35': 11302.0, 'u36': 127617.0, 'u39': 1663.0, 'u42': 5441.0, 'u43': 5971.0, 'u44': 10523.0, 'u45': 5409.0, 'u46': 29474.0, 'u47': 8796.0, 'u49': 182648.0, 'u50': 26664.0, 'u51': 36461.0, 'u52': 19612.0, 'u53': 11274.0, 'u56': 4117.0, 'u57': 61680.0, 'u58': 3706.0, 'u59': 8019.0})


In [11]:
activity_day, activity_eve, activity_nig = data.activity()
print(activity_day)
print(activity_eve)
print(activity_nig)


defaultdict(<class 'float'>, {'u00': 0.08464510820128836, 'u01': 0.05460767973222856, 'u02': 0.05063033125590646, 'u03': 0.014592951021728874, 'u04': 0.09271802176142675, 'u05': 0.03860504125487352, 'u07': 0.04380741535971253, 'u08': 0.04007109009777014, 'u09': 0.056696657810720905, 'u10': 0.031402037124274085, 'u12': 0.046480478362184434, 'u13': 0.03387088153694409, 'u14': 0.04115058266749834, 'u15': 0.05250207959371306, 'u16': 0.0371872432755814, 'u17': 0.040351652394367325, 'u18': 0.027503035590083345, 'u19': 0.0324531822779363, 'u20': 0.01907079478179171, 'u22': 0.038314223010992604, 'u23': 0.016013360624494623, 'u24': 0.03073088633940157, 'u27': 0.023003533786825698, 'u30': 0.060260468153415014, 'u31': 0.0430672752251966, 'u32': 0.061418041136002154, 'u33': 0.03561379715190834, 'u34': 0.07027669431437712, 'u35': 0.029759804267532407, 'u36': 0.05625326502755262, 'u39': 0.005056438962876388, 'u42': 0.04002576263247263, 'u43': 0.06924239639427099, 'u44': 0.01902950400111426, 'u45': 0

In [12]:
indoor_mobility_day, indoor_mobility_eve, indoor_mobility_nig = data.indoor_mobility()
print(indoor_mobility_day)
print(indoor_mobility_eve)
print(indoor_mobility_nig)


defaultdict(<class 'int'>, {'u00': 26180, 'u01': 14205, 'u02': 23422, 'u03': 7187, 'u04': 28950, 'u05': 12195, 'u07': 31689, 'u08': 17513, 'u09': 32728, 'u10': 15660, 'u12': 30792, 'u13': 37634, 'u14': 14708, 'u15': 13551, 'u16': 13189, 'u17': 17841, 'u18': 11386, 'u19': 22637, 'u20': 9685, 'u22': 23567, 'u23': 7049, 'u24': 6085, 'u27': 10525, 'u30': 39677, 'u31': 22930, 'u32': 41324, 'u33': 12294, 'u34': 11612, 'u35': 12889, 'u36': 47953, 'u39': 3880, 'u42': 25339, 'u43': 17086, 'u44': 12516, 'u45': 27470, 'u46': 20307, 'u47': 7907, 'u49': 35107, 'u50': 11391, 'u51': 31976, 'u52': 148908, 'u53': 24815, 'u56': 16864, 'u57': 42354, 'u58': 21605, 'u59': 39262})
defaultdict(<class 'int'>, {'u00': 15465, 'u01': 8603, 'u02': 10066, 'u03': 2957, 'u04': 17379, 'u05': 7873, 'u07': 18977, 'u08': 11186, 'u09': 16688, 'u10': 8743, 'u12': 14792, 'u13': 28680, 'u14': 11616, 'u15': 8736, 'u16': 7321, 'u17': 10736, 'u18': 7457, 'u19': 16042, 'u20': 5485, 'u22': 17817, 'u23': 3330, 'u24': 4537, 'u27':

In [13]:
distance_day, distance_eve, distance_nig = data.traveled_distance()
print(distance_day)
print(distance_eve)
print(distance_nig)


defaultdict(<class 'int'>, {'u00': 58.50862077999984, 'u01': 2.039335439999995, 'u02': 6.346208580000095, 'u03': 0.9332981500002404, 'u04': 21.01711453999942, 'u05': 1.9097226499992317, 'u07': 14.437643789999925, 'u08': 4.0688933000006315, 'u09': 8.484698799999912, 'u10': 1.6516502299994116, 'u12': 30.215704269999975, 'u13': 7.266625980000875, 'u14': 3.58260737999953, 'u15': 9.676528250000125, 'u16': 1.7735113199997983, 'u17': 1.4237323299992255, 'u18': 1.4329343000003405, 'u19': 2.853140309999432, 'u20': 0.364810699999488, 'u22': 57.33528751000055, 'u23': 0.5696746699999764, 'u24': 0.6954536799993534, 'u27': 0.967252109999734, 'u30': 22.601449620000672, 'u31': 18.227201619999782, 'u32': 2.3858496900003914, 'u33': 0.9047255499998812, 'u34': 3.778928099999746, 'u35': 2.6159017300000045, 'u36': 58.83883202000035, 'u39': 0.6662298999999194, 'u42': 5.943678000000098, 'u43': 11.650000789999694, 'u44': 3.735085969999929, 'u45': 6.452083559999863, 'u46': 7.309094319999147, 'u47': 2.8945579501

In [14]:
y_pre_flour = []
y_pre_pos_panas, y_pre_neg_panas = [], []
x_con_duration_day, x_con_duration_eve, x_con_duration_nig, x_con_duration  = [], [], [], []
x_con_freq_day, x_con_freq_eve, x_con_freq_nig, x_con_freq = [], [], [], []
x_co_location = []
x_activity_day, x_activity_eve, x_activity_nig, x_activity = [], [], [], []
x_traveled_day, x_traveled_eve ,x_traveled_nig, x_traveled = [], [], [], []
x_indoor_day, x_indoor_eve, x_indoor_nig, x_indoor = [], [], [], []
for stu_id in post_student:
    y_pre_flour.append(pre_flour[stu_id])
    y_pre_pos_panas.append(pre_pos_panas[stu_id])
    y_pre_neg_panas.append(pre_neg_panas[stu_id])
    x_con_duration_day.append(conv_dura_day[stu_id])
    x_con_duration_eve.append(conv_dura_eve[stu_id])
    x_con_duration_nig.append(conv_dura_nig[stu_id])
    x_con_duration.append(conv_dura_day[stu_id] + conv_dura_eve[stu_id] + conv_dura_nig[stu_id])
    x_con_freq_day.append(conv_freq_day[stu_id])
    x_con_freq_eve.append(conv_freq_eve[stu_id])
    x_con_freq_nig.append(conv_freq_nig[stu_id])
    x_con_freq.append(conv_freq_day[stu_id] + conv_freq_eve[stu_id] + conv_freq_nig[stu_id])
    x_co_location.append(co_location[stu_id])
    x_activity_day.append(activity_day[stu_id])
    x_activity_eve.append(activity_eve[stu_id])
    x_activity_nig.append(activity_nig[stu_id])
    x_activity.append(activity_day[stu_id] + activity_eve[stu_id] + activity_nig[stu_id])
    x_traveled_day.append(distance_day[stu_id])
    x_traveled_eve.append(distance_eve[stu_id])
    x_traveled_nig.append(distance_nig[stu_id])
    x_traveled.append(distance_day[stu_id] + distance_eve[stu_id] + distance_nig[stu_id])
    x_indoor_day.append(indoor_mobility_day[stu_id])
    x_indoor_eve.append(indoor_mobility_eve[stu_id])
    x_indoor_nig.append(indoor_mobility_nig[stu_id])
    x_indoor.append(indoor_mobility_day[stu_id] + 
                    indoor_mobility_eve[stu_id] + indoor_mobility_nig[stu_id])


In [15]:
print(len(x_con_duration_nig))
print('relation between x_con_duration_day, y_pre_flour')
a = np.array([x_con_duration_day, y_pre_flour])
print(np.corrcoef(a)[0][1])

print('relation between x_con_duration_eve, y_pre_flour')
a = np.array([x_con_duration_eve, y_pre_flour])
print(np.corrcoef(a)[0][1])

print('relation between x_con_duration_nig, y_pre_flour')
a = np.array([x_con_duration_nig, y_pre_flour])
print(np.corrcoef(a)[0][1])

print('relation between x_con_duration, y_pre_flour')
a = np.array([x_con_duration, y_pre_flour])
print(np.corrcoef(a)[0][1])

print('relation between x_con_freq_day, y_pre_flour')
a = np.array([x_con_freq_day, y_pre_flour])
print(np.corrcoef(a)[0][1])

print('relation between x_con_freq_eve, y_pre_flour')
a = np.array([x_con_freq_eve, y_pre_flour])
print(np.corrcoef(a)[0][1])

print('relation between x_con_freq_nig, y_pre_flour')
a = np.array([x_con_freq_nig, y_pre_flour])
print(np.corrcoef(a)[0][1])

print('relation between x_con_freq, y_pre_flour')
a = np.array([x_con_freq, y_pre_flour])
print(np.corrcoef(a)[0][1])

print('relation between x_co_location, y_pre_flour')
a = np.array([x_co_location, y_pre_flour])
print(np.corrcoef(a)[0][1])

print('relation between x_activity_day, y_pre_flour')
a = np.array([x_activity_day, y_pre_flour])
print(np.corrcoef(a)[0][1])

print('relation between x_activity_eve, y_pre_flour')
a = np.array([x_activity_eve, y_pre_flour])
print(np.corrcoef(a)[0][1])

print('relation between x_activity_nig, y_pre_flour')
a = np.array([x_activity_nig, y_pre_flour])
print(np.corrcoef(a)[0][1])

print('relation between x_activity, y_pre_flour')
a = np.array([x_activity, y_pre_flour])
print(np.corrcoef(a)[0][1])

print('relation between x_traveled_day, y_pre_flour')
a = np.array([x_traveled_day, y_pre_flour])
print(np.corrcoef(a)[0][1])

print('relation between x_traveled_eve, y_pre_flour')
a = np.array([x_traveled_eve, y_pre_flour])
print(np.corrcoef(a)[0][1])

print('relation between x_traveled_nig, y_pre_flour')
a = np.array([x_traveled_nig, y_pre_flour])
print(np.corrcoef(a)[0][1])

print('relation between x_traveled, y_pre_flour')
a = np.array([x_traveled, y_pre_flour])
print(np.corrcoef(a)[0][1])

print('relation between x_indoor_day, y_pre_flour')
a = np.array([x_indoor_day, y_pre_flour])
print(np.corrcoef(a)[0][1])

print('relation between x_indoor_eve, y_pre_flour')
a = np.array([x_indoor_eve, y_pre_flour])
print(np.corrcoef(a)[0][1])

print('relation between x_indoor_nig, y_pre_flour')
a = np.array([x_indoor_nig, y_pre_flour])
print(np.corrcoef(a)[0][1])

print('relation between x_indoor, y_pre_flour')
a = np.array([x_indoor, y_pre_flour])
print(np.corrcoef(a)[0][1])


37
relation between x_con_duration_day, y_pre_flour
0.1983739066738428
relation between x_con_duration_eve, y_pre_flour
0.2042490277411192
relation between x_con_duration_nig, y_pre_flour
-0.14515999842966334
relation between x_con_duration, y_pre_flour
0.155527468140591
relation between x_con_freq_day, y_pre_flour
0.025880647468060538
relation between x_con_freq_eve, y_pre_flour
-0.018519492284232095
relation between x_con_freq_nig, y_pre_flour
-0.34255481470889787
relation between x_con_freq, y_pre_flour
-0.09790565230722738
relation between x_co_location, y_pre_flour
0.2969586008751014
relation between x_activity_day, y_pre_flour
-0.05177020035421865
relation between x_activity_eve, y_pre_flour
0.009514203683879353
relation between x_activity_nig, y_pre_flour
-0.14592001223790724
relation between x_activity, y_pre_flour
-0.055002680986590044
relation between x_traveled_day, y_pre_flour
0.060374972343818875
relation between x_traveled_eve, y_pre_flour
0.0921635484545618
relation betw

In [16]:
print('relation between x_con_duration_day, y_pre_pos_panas')
a = np.array([x_con_duration_day, y_pre_pos_panas])
print(np.corrcoef(a)[0][1])

print('relation between x_con_duration_eve, y_pre_pos_panas')
a = np.array([x_con_duration_eve, y_pre_pos_panas])
print(np.corrcoef(a)[0][1])

print('relation between x_con_duration_nig, y_pre_pos_panas')
a = np.array([x_con_duration_nig, y_pre_pos_panas])
print(np.corrcoef(a)[0][1])

print('relation between x_con_duration, y_pre_pos_panas')
a = np.array([x_con_duration, y_pre_pos_panas])
print(np.corrcoef(a)[0][1])

print('relation between x_con_freq_day, y_pre_pos_panas')
a = np.array([x_con_freq_day, y_pre_pos_panas])
print(np.corrcoef(a)[0][1])

print('relation between x_con_freq_eve, y_pre_pos_panas')
a = np.array([x_con_freq_eve, y_pre_pos_panas])
print(np.corrcoef(a)[0][1])

print('relation between x_con_freq_nig, y_pre_pos_panas')
a = np.array([x_con_freq_nig, y_pre_pos_panas])
print(np.corrcoef(a)[0][1])

print('relation between x_con_freq, y_pre_pos_panas')
a = np.array([x_con_freq, y_pre_pos_panas])
print(np.corrcoef(a)[0][1])

print('relation between x_co_location, y_pre_pos_panas')
a = np.array([x_co_location, y_pre_pos_panas])
print(np.corrcoef(a)[0][1])

print('relation between x_activity_day, y_pre_pos_panas')
a = np.array([x_activity_day, y_pre_pos_panas])
print(np.corrcoef(a)[0][1])

print('relation between x_activity_eve, y_pre_pos_panas')
a = np.array([x_activity_eve, y_pre_pos_panas])
print(np.corrcoef(a)[0][1])

print('relation between x_activity_nig, y_pre_pos_panas')
a = np.array([x_activity_nig, y_pre_pos_panas])
print(np.corrcoef(a)[0][1])

print('relation between x_activity, y_pre_pos_panas')
a = np.array([x_activity, y_pre_pos_panas])
print(np.corrcoef(a)[0][1])

print('relation between x_traveled_day, y_pre_pos_panas')
a = np.array([x_traveled_day, y_pre_pos_panas])
print(np.corrcoef(a)[0][1])

print('relation between x_traveled_eve, y_pre_pos_panas')
a = np.array([x_traveled_eve, y_pre_pos_panas])
print(np.corrcoef(a)[0][1])

print('relation between x_traveled_nig, y_pre_pos_panas')
a = np.array([x_traveled_nig, y_pre_pos_panas])
print(np.corrcoef(a)[0][1])

print('relation between x_traveled, y_pre_pos_panas')
a = np.array([x_traveled, y_pre_pos_panas])
print(np.corrcoef(a)[0][1])

print('relation between x_indoor_day, y_pre_pos_panas')
a = np.array([x_indoor_day, y_pre_pos_panas])
print(np.corrcoef(a)[0][1])

print('relation between x_indoor_eve, y_pre_pos_panas')
a = np.array([x_indoor_eve, y_pre_pos_panas])
print(np.corrcoef(a)[0][1])

print('relation between x_indoor_nig, y_pre_pos_panas')
a = np.array([x_indoor_nig, y_pre_pos_panas])
print(np.corrcoef(a)[0][1])

print('relation between x_indoor, y_pre_pos_panas')
a = np.array([x_indoor, y_pre_pos_panas])
print(np.corrcoef(a)[0][1])


relation between x_con_duration_day, y_pre_pos_panas
0.12152985965264672
relation between x_con_duration_eve, y_pre_pos_panas
0.0006990450602570299
relation between x_con_duration_nig, y_pre_pos_panas
0.2743110453817675
relation between x_con_duration, y_pre_pos_panas
0.12256094947033484
relation between x_con_freq_day, y_pre_pos_panas
0.14497860676193638
relation between x_con_freq_eve, y_pre_pos_panas
0.061749485165868076
relation between x_con_freq_nig, y_pre_pos_panas
0.16081366257541624
relation between x_con_freq, y_pre_pos_panas
0.1388477345747584
relation between x_co_location, y_pre_pos_panas
0.3308093505510031
relation between x_activity_day, y_pre_pos_panas
0.3228592522866612
relation between x_activity_eve, y_pre_pos_panas
0.3437281554330784
relation between x_activity_nig, y_pre_pos_panas
0.5461085625817818
relation between x_activity, y_pre_pos_panas
0.40448277632475554
relation between x_traveled_day, y_pre_pos_panas
0.22160975743631986
relation between x_traveled_eve, y

In [17]:
print('relation between x_con_duration_day, y_pre_neg_panas')
a = np.array([x_con_duration_day, y_pre_neg_panas])
print(np.corrcoef(a)[0][1])

print('relation between x_con_duration_eve, y_pre_neg_panas')
a = np.array([x_con_duration_eve, y_pre_neg_panas])
print(np.corrcoef(a)[0][1])

print('relation between x_con_duration_nig, y_pre_neg_panas')
a = np.array([x_con_duration_nig, y_pre_neg_panas])
print(np.corrcoef(a)[0][1])

print('relation between x_con_duration, y_pre_neg_panas')
a = np.array([x_con_duration, y_pre_neg_panas])
print(np.corrcoef(a)[0][1])

print('relation between x_con_freq_day, y_pre_neg_panas')
a = np.array([x_con_freq_day, y_pre_neg_panas])
print(np.corrcoef(a)[0][1])

print('relation between x_con_freq_eve, y_pre_neg_panas')
a = np.array([x_con_freq_eve, y_pre_neg_panas])
print(np.corrcoef(a)[0][1])

print('relation between x_con_freq_nig, y_pre_neg_panas')
a = np.array([x_con_freq_nig, y_pre_neg_panas])
print(np.corrcoef(a)[0][1])

print('relation between x_con_freq, y_pre_neg_panas')
a = np.array([x_con_freq, y_pre_neg_panas])
print(np.corrcoef(a)[0][1])

print('relation between x_co_location, y_pre_neg_panas')
a = np.array([x_co_location, y_pre_neg_panas])
print(np.corrcoef(a)[0][1])

print('relation between x_activity_day, y_pre_neg_panas')
a = np.array([x_activity_day, y_pre_neg_panas])
print(np.corrcoef(a)[0][1])

print('relation between x_activity_eve, y_pre_neg_panas')
a = np.array([x_activity_eve, y_pre_neg_panas])
print(np.corrcoef(a)[0][1])

print('relation between x_activity_nig, y_pre_neg_panas')
a = np.array([x_activity_nig, y_pre_neg_panas])
print(np.corrcoef(a)[0][1])

print('relation between x_activity, y_pre_neg_panas')
a = np.array([x_activity, y_pre_neg_panas])
print(np.corrcoef(a)[0][1])

print('relation between x_traveled_day, y_pre_neg_panas')
a = np.array([x_traveled_day, y_pre_neg_panas])
print(np.corrcoef(a)[0][1])

print('relation between x_traveled_eve, y_pre_neg_panas')
a = np.array([x_traveled_eve, y_pre_neg_panas])
print(np.corrcoef(a)[0][1])

print('relation between x_traveled_nig, y_pre_neg_panas')
a = np.array([x_traveled_nig, y_pre_neg_panas])
print(np.corrcoef(a)[0][1])

print('relation between x_traveled, y_pre_neg_panas')
a = np.array([x_traveled, y_pre_neg_panas])
print(np.corrcoef(a)[0][1])

print('relation between x_indoor_day, y_pre_neg_panas')
a = np.array([x_indoor_day, y_pre_neg_panas])
print(np.corrcoef(a)[0][1])

print('relation between x_indoor_eve, y_pre_neg_panas')
a = np.array([x_indoor_eve, y_pre_neg_panas])
print(np.corrcoef(a)[0][1])

print('relation between x_indoor_nig, y_pre_neg_panas')
a = np.array([x_indoor_nig, y_pre_neg_panas])
print(np.corrcoef(a)[0][1])

print('relation between x_indoor, y_pre_neg_panas')
a = np.array([x_indoor, y_pre_neg_panas])
print(np.corrcoef(a)[0][1])


relation between x_con_duration_day, y_pre_neg_panas
0.035831973717329546
relation between x_con_duration_eve, y_pre_neg_panas
0.017213670931264535
relation between x_con_duration_nig, y_pre_neg_panas
0.1659312098315138
relation between x_con_duration, y_pre_neg_panas
0.061322885542421365
relation between x_con_freq_day, y_pre_neg_panas
-0.27174011925262886
relation between x_con_freq_eve, y_pre_neg_panas
-0.16851345200852155
relation between x_con_freq_nig, y_pre_neg_panas
0.02498374095052547
relation between x_con_freq, y_pre_neg_panas
-0.18217957159883374
relation between x_co_location, y_pre_neg_panas
-0.030688204619597658
relation between x_activity_day, y_pre_neg_panas
0.13594015059827147
relation between x_activity_eve, y_pre_neg_panas
0.21116406346102262
relation between x_activity_nig, y_pre_neg_panas
0.3818380883335803
relation between x_activity, y_pre_neg_panas
0.2286259929215219
relation between x_traveled_day, y_pre_neg_panas
0.0013310688543055676
relation between x_trave

In [23]:
c = 0
for key in post_flour:
    c += pre_flour[key]
c = c / len(pre_flour)
print(c)
y_flour = []
for key in post_flour:
    if pre_flour[key] > c:
        y_flour.append(1)
    else:
        y_flour.append(0)
y_flour = np.array(y_flour)


34.45652173913044


In [24]:
x = []
for a, b, c, d, e, f, g in zip(x_con_freq_nig, x_co_location, x_con_duration_eve, 
                    x_con_duration_day, x_activity_nig, x_con_duration_nig, x_traveled_nig):
    x.append([a,b,c,d,e,f,g])
x = np.array(x)
print(len(x))


37


In [27]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import GridSearchCV
k_range = range(1, 25)
weight_options = ['uniform', 'distance']
param_grid = {'n_neighbors':k_range,'weights':weight_options}
knn = KNeighborsClassifier(n_neighbors=1)
grid = GridSearchCV(estimator=knn, param_grid=param_grid, scoring='accuracy', cv=5)


In [38]:
print(len(x), len(x[0]))
print(x[:, :6].shape)
grid.fit(x[:, :1], y_flour)
# print('网格搜索-度量记录：',grid.cv_results_)  
print('网格搜索-最佳度量值:',grid.best_score_)  
print('网格搜索-最佳参数：',grid.best_params_)  
print('网格搜索-最佳模型：',grid.best_estimator_) 


37 7
(37, 6)
网格搜索-最佳度量值: 0.8378378378378378
网格搜索-最佳参数： {'n_neighbors': 7, 'weights': 'uniform'}
网格搜索-最佳模型： KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=None, n_neighbors=7, p=2,
                     weights='uniform')




In [41]:
from sklearn.preprocessing import MinMaxScaler
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV
clf = DecisionTreeClassifier(random_state=42)
paramaters = {'max_depth': [2, 4, 6, 8, 10], 'min_samples_leaf': [2, 4, 6, 8, 10], 
              'min_samples_split': [2, 4, 6, 8, 10]}
grid = GridSearchCV(estimator=clf, param_grid=paramaters, scoring='accuracy', cv=10)


In [44]:
grid.fit(x[:, :1], y_flour)
# print('网格搜索-度量记录：',grid.cv_results_)  
print('网格搜索-最佳度量值:',grid.best_score_)  
print('网格搜索-最佳参数：',grid.best_params_)  
print('网格搜索-最佳模型：',grid.best_estimator_)




网格搜索-最佳度量值: 0.8378378378378378
网格搜索-最佳参数： {'max_depth': 2, 'min_samples_leaf': 8, 'min_samples_split': 2}
网格搜索-最佳模型： DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=2,
                       max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=8, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort=False,
                       random_state=42, splitter='best')


In [45]:
from sklearn.preprocessing import MinMaxScaler
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV
clf = SVC()
tuned_parameters = [{'kernel': ['rbf'], 'gamma': [1e-3, 1e-4],
                     'C': [1, 10, 100, 1000]},
                    {'kernel': ['linear'], 'C': [1, 10, 100, 1000]}]
grid = GridSearchCV(estimator=clf, param_grid=tuned_parameters, cv=10, scoring='accuracy')


In [47]:
scaler = MinMaxScaler()
x = scaler.fit_transform(x)
print(x[:,:1].shape)
grid.fit(x, y_flour)
# print('网格搜索-度量记录：',grid.cv_results_)  
print('网格搜索-最佳度量值:',grid.best_score_)  
print('网格搜索-最佳参数：',grid.best_params_)  
print('网格搜索-最佳模型：',grid.best_estimator_) 


(37, 1)
网格搜索-最佳度量值: 0.8378378378378378
网格搜索-最佳参数： {'C': 1, 'gamma': 0.001, 'kernel': 'rbf'}
网格搜索-最佳模型： SVC(C=1, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma=0.001, kernel='rbf',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False)




In [48]:
x1 = []
for a, b, c, d, e, f, g in zip( x_activity_nig, x_activity, x_indoor_nig, x_indoor
                               ,x_con_duration_nig, x_traveled, x_con_freq_nig):
    x1.append([a,b,c,d,e,f,g])
x1 = np.array(x1)
print(len(x1))


37


In [54]:
c = 0
for key in post_student:
    c += post_pos_panas[key]
c = c / len(post_pos_panas)
print(c)
y_pos_panas = []
for key in post_student:
    if post_pos_panas[key] > c:
        y_pos_panas.append(1)
    else:
        y_pos_panas.append(0)
y_pos_panas = np.array(y_pos_panas)
print(y_pos_panas.shape)


27.794871794871796
(37,)


In [55]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import GridSearchCV
k_range = range(1, 25)
weight_options = ['uniform', 'distance']
param_grid = {'n_neighbors':k_range,'weights':weight_options}
knn = KNeighborsClassifier(n_neighbors=1)
grid = GridSearchCV(estimator=knn, param_grid=param_grid, scoring='accuracy', cv=5)


In [63]:
print(len(x1), len(x1[0]))
grid.fit(x1[:, :3], y_pos_panas)
# print('网格搜索-度量记录：',grid.cv_results_)  
print('网格搜索-最佳度量值:',grid.best_score_)  
print('网格搜索-最佳参数：',grid.best_params_)  
print('网格搜索-最佳模型：',grid.best_estimator_) 


37 7
网格搜索-最佳度量值: 0.7297297297297297
网格搜索-最佳参数： {'n_neighbors': 7, 'weights': 'uniform'}
网格搜索-最佳模型： KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=None, n_neighbors=7, p=2,
                     weights='uniform')




In [64]:
from sklearn.preprocessing import MinMaxScaler
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV
clf = DecisionTreeClassifier(random_state=42)
paramaters = {'max_depth': [2, 4, 6, 8, 10], 'min_samples_leaf': [2, 4, 6, 8, 10], 
              'min_samples_split': [2, 4, 6, 8, 10]}
grid = GridSearchCV(estimator=clf, param_grid=paramaters, scoring='accuracy', cv=10)


In [71]:
grid.fit(x1[:, :3], y_pos_panas)
# print('网格搜索-度量记录：',grid.cv_results_)  
print('网格搜索-最佳度量值:',grid.best_score_)  
print('网格搜索-最佳参数：',grid.best_params_)  
print('网格搜索-最佳模型：',grid.best_estimator_)


网格搜索-最佳度量值: 0.6756756756756757
网格搜索-最佳参数： {'max_depth': 2, 'min_samples_leaf': 8, 'min_samples_split': 2}
网格搜索-最佳模型： DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=2,
                       max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=8, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort=False,
                       random_state=42, splitter='best')




In [72]:
from sklearn.preprocessing import MinMaxScaler
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV
clf = SVC()
tuned_parameters = [{'kernel': ['rbf'], 'gamma': [1e-3, 1e-4],
                     'C': [1, 10, 100, 1000]},
                    {'kernel': ['linear'], 'C': [1, 10, 100, 1000]}]
grid = GridSearchCV(estimator=clf, param_grid=tuned_parameters, cv=10, scoring='accuracy')


In [79]:
scaler = MinMaxScaler()
x1 = scaler.fit_transform(x1)
print(x.shape)
grid.fit(x1[:, :], y_pos_panas)
# print('网格搜索-度量记录：',grid.cv_results_)  
print('网格搜索-最佳度量值:',grid.best_score_)  
print('网格搜索-最佳参数：',grid.best_params_)  
print('网格搜索-最佳模型：',grid.best_estimator_) 



(37, 7)
网格搜索-最佳度量值: 0.6756756756756757
网格搜索-最佳参数： {'C': 1000, 'kernel': 'linear'}
网格搜索-最佳模型： SVC(C=1000, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
    kernel='linear', max_iter=-1, probability=False, random_state=None,
    shrinking=True, tol=0.001, verbose=False)




In [80]:
x2 = []
for a, b, c, d, e, f, g in zip(x_activity_nig,  x_con_freq_day, x_indoor_nig,  x_activity
                               ,x_con_freq, x_con_duration_nig, x_traveled_eve):
    x2.append([a,b,c,d,e,f,g])
x2 = np.array(x2)
print(len(x2))


37


In [81]:
c = 0
for key in post_student:
    c += post_neg_panas[key]
c = c / len(post_neg_panas)
print(c)
y_neg_panas = []
for key in post_student:
    print(key)
    if post_neg_panas[key] > c:
        y_neg_panas.append(1)
    else:
        y_neg_panas.append(0)
y_neg_panas = np.array(y_neg_panas)


20.102564102564102
u00
u01
u02
u03
u04
u05
u07
u09
u10
u14
u15
u16
u17
u19
u20
u23
u24
u27
u30
u31
u32
u33
u34
u35
u36
u42
u43
u44
u45
u46
u47
u49
u51
u52
u53
u56
u59


In [83]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import GridSearchCV
k_range = range(1, 25)
weight_options = ['uniform', 'distance']
param_grid = {'n_neighbors':k_range,'weights':weight_options}
knn = KNeighborsClassifier(n_neighbors=1)
grid = GridSearchCV(estimator=knn, param_grid=param_grid, scoring='accuracy', cv=5)


In [96]:
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
print(len(x2), len(x2[0]))
x3 = scaler.fit_transform(x2)
grid.fit(x3[:, :5], y_neg_panas)
# print('网格搜索-度量记录：',grid.cv_results_)  
print('网格搜索-最佳度量值:',grid.best_score_)  
print('网格搜索-最佳参数：',grid.best_params_)  
print('网格搜索-最佳模型：',grid.best_estimator_) 


37 7
网格搜索-最佳度量值: 0.5945945945945946
网格搜索-最佳参数： {'n_neighbors': 1, 'weights': 'uniform'}
网格搜索-最佳模型： KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=None, n_neighbors=1, p=2,
                     weights='uniform')




In [97]:
from sklearn.preprocessing import MinMaxScaler
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV
clf = DecisionTreeClassifier(random_state=42)
paramaters = {'max_depth': [2, 4, 6, 8, 10], 'min_samples_leaf': [2, 4, 6, 8, 10], 
              'min_samples_split': [2, 4, 6, 8, 10]}
grid = GridSearchCV(estimator=clf, param_grid=paramaters, scoring='accuracy', cv=10)


In [102]:
grid.fit(x2[:, :5], y_neg_panas)
# print('网格搜索-度量记录：',grid.cv_results_)  
print('网格搜索-最佳度量值:',grid.best_score_)  
print('网格搜索-最佳参数：',grid.best_params_)  
print('网格搜索-最佳模型：',grid.best_estimator_)


网格搜索-最佳度量值: 0.6486486486486487
网格搜索-最佳参数： {'max_depth': 4, 'min_samples_leaf': 4, 'min_samples_split': 2}
网格搜索-最佳模型： DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=4,
                       max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=4, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort=False,
                       random_state=42, splitter='best')




In [103]:
from sklearn.preprocessing import MinMaxScaler
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV
clf = SVC()
tuned_parameters = [{'kernel': ['rbf'], 'gamma': [1e-3, 1e-4],
                     'C': [1, 10, 100, 1000]},
                    {'kernel': ['linear'], 'C': [1, 10, 100, 1000]}]
grid = GridSearchCV(estimator=clf, param_grid=tuned_parameters, cv=10, scoring='accuracy')


In [111]:
scaler = MinMaxScaler()
x2 = scaler.fit_transform(x2)
print(x.shape)
grid.fit(x2[:, :4], y_neg_panas)
# print('网格搜索-度量记录：',grid.cv_results_)  
print('网格搜索-最佳度量值:',grid.best_score_)  
print('网格搜索-最佳参数：',grid.best_params_)  
print('网格搜索-最佳模型：',grid.best_estimator_) 




(37, 7)
网格搜索-最佳度量值: 0.6216216216216216
网格搜索-最佳参数： {'C': 100, 'kernel': 'linear'}
网格搜索-最佳模型： SVC(C=100, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
    kernel='linear', max_iter=-1, probability=False, random_state=None,
    shrinking=True, tol=0.001, verbose=False)


