-
Notifications
You must be signed in to change notification settings - Fork 0
/
main.py
136 lines (113 loc) · 4.52 KB
/
main.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
# -*- coding: utf-8 -*-
import ast
import pandas as pd
import pickle
from younet_rnd_infrastructure.tri.common import utils
from younet_rnd_infrastructure.tri.common import file_tool
from younet_rnd_infrastructure.tri.facebook_crawl_unit.get_list_friends import get_list_friends
# Delimiter used to pack several per-school values into a single CSV cell
# (joined in extract_data, split again in build_set_school).
SEP = '#$@%()'
def extract_data(input_path='./input/data_700k.csv',
                 output_path='./temp/data_detail.csv'):
    """Flatten each row's education list into three SEP-joined columns.

    Reads the CSV at ``input_path``, parses every ``education`` cell as a
    Python literal (an iterable of school dicts) and adds the columns
    ``school_name``, ``school_type`` and ``school_year``.  Each new cell
    holds the values of all schools of that row joined with ``SEP``; a
    school that lacks a field contributes an empty string, and a row with
    no schools gets ``''``.  The augmented frame is written to
    ``output_path``.

    Args:
        input_path: CSV file containing an ``education`` column.
        output_path: Destination CSV for the augmented frame.
    """
    data_df = pd.read_csv(input_path, encoding='utf-8')
    # Assign the filled column back instead of calling fillna(inplace=True)
    # on the column selection: the chained in-place form is not guaranteed
    # to update the frame (it does not under pandas Copy-on-Write), which
    # would leave NaN cells for ast.literal_eval to choke on.
    data_df['education'] = data_df['education'].fillna('{}')

    school_info_name = []
    school_info_type = []
    school_info_year = []
    for i, raw_education in enumerate(data_df['education']):
        print(i)  # progress indicator
        schools_name = []
        schools_type = []
        schools_year = []
        # Missing cells were filled with '{}', which parses to an empty
        # container and therefore yields no schools.
        for school in ast.literal_eval(raw_education):
            schools_name.append(
                school['school']['name'] if 'school' in school else '')
            schools_type.append(school['type'] if 'type' in school else '')
            schools_year.append(
                school['year']['name'] if 'year' in school else '')
        # str.join returns '' for an empty list, so no special case is
        # needed for rows without schools.
        school_info_name.append(SEP.join(schools_name))
        school_info_type.append(SEP.join(schools_type))
        school_info_year.append(SEP.join(schools_year))

    data_df['school_name'] = school_info_name
    data_df['school_type'] = school_info_type
    data_df['school_year'] = school_info_year
    data_df.to_csv(output_path, encoding='utf-8', index=False)
    print('Done')
def extract_data_v2(input_path='./input/data_700k.csv',
                    output_path='./temp/data_detail.csv'):
    """Explode the education list into one output row per school.

    Reads the CSV at ``input_path`` and parses every ``education`` cell as
    a Python literal (an iterable of school dicts).  For each school a
    copy of the source row is emitted with three extra fields:
    ``school_raw_name``, ``school_type`` and ``school_year`` (empty string
    when the school lacks that field).  Rows without any school produce
    no output rows.  The result is written to ``output_path``.

    Args:
        input_path: CSV file containing an ``education`` column.
        output_path: Destination CSV for the exploded frame.

    Returns:
        pandas.DataFrame with one row per (source row, school) pair.
    """
    data_df = pd.read_csv(input_path, encoding='utf-8')
    # Assign the filled column back instead of calling fillna(inplace=True)
    # on the column selection: the chained in-place form is not guaranteed
    # to update the frame (it does not under pandas Copy-on-Write), which
    # would leave NaN cells for ast.literal_eval to choke on.
    data_df['education'] = data_df['education'].fillna('{}')

    result = []
    for i, raw_education in enumerate(data_df['education']):
        print(i)  # progress indicator
        education_info = ast.literal_eval(raw_education)
        if not education_info:
            continue
        # Convert the row once per source row rather than once per school.
        base_record = data_df.iloc[i].to_dict()
        for school in education_info:
            record = dict(base_record)
            record['school_raw_name'] = (
                school['school']['name'] if 'school' in school else '')
            record['school_type'] = school['type'] if 'type' in school else ''
            record['school_year'] = (
                school['year']['name'] if 'year' in school else '')
            result.append(record)

    result_df = pd.DataFrame(result)
    result_df.to_csv(output_path, encoding='utf-8', index=False)
    print('Done')
    return result_df
def clean_data(data_detail_df):
    """Drop rows whose ``school_raw_name`` is missing and renumber the index.

    The frame is modified in place; the same object is returned so the
    call can be chained.
    """
    data_detail_df.dropna(subset=['school_raw_name'], inplace=True)
    # Renumber 0..n-1 so positional and label-based lookups agree again.
    data_detail_df.reset_index(drop=True, inplace=True)
    return data_detail_df
def build_set_school(data_df):
    """Unpack the SEP-joined school columns into a flat name/type table.

    Args:
        data_df: Frame with ``school_name`` and ``school_type`` columns
            whose cells hold values joined with ``SEP``.

    Returns:
        pandas.DataFrame with columns ``name`` and ``type``, one row per
        packed value.

    Raises:
        ValueError: If a row packs a different number of names and types.
    """
    school_names = []
    school_types = []
    total = data_df.shape[0]
    # Iterate the columns directly so the frame does not need a 0..n-1 index.
    rows = zip(data_df['school_name'], data_df['school_type'])
    for i, (raw_names, raw_types) in enumerate(rows):
        print('%s/%s' % (i, total))  # progress indicator
        # Empty cells come back from CSV as NaN; treat them like the empty
        # string they were written as, so they split to [''].
        names = ('' if pd.isnull(raw_names) else raw_names).split(SEP)
        types = ('' if pd.isnull(raw_types) else raw_types).split(SEP)
        if len(names) != len(types):
            raise ValueError(
                'row %s packs %s school names but %s school types'
                % (i, len(names), len(types)))
        school_names.extend(names)
        school_types.extend(types)
    return pd.DataFrame({'name': school_names, 'type': school_types})
def read_dump_file(filename):
    """Load and return the object pickled in ``filename``.

    NOTE(review): unpickling can execute arbitrary code, so only call this
    on dump files from a trusted source.
    """
    with open(filename, 'rb') as dump_file:
        return pickle.load(dump_file)
if __name__ == '__main__':
    # Load the detail table that the extraction functions write to ./temp.
    data_df = pd.read_csv('./temp/data_detail.csv', encoding='utf-8')
    print('Done')

    # Friend crawl, kept for reference (not executed):
    # list_ids = list(set(list(data_df['id'])))
    # # 109.609470844 s -> 50 id
    # friends = utils.time_measure(utils.run_multi_url_request_with_time_constraint,
    #                              [get_list_friends.get_friends_id, 1, [list_ids[:50]]])
    friendships = file_tool.load_json('./temp/friends_0_1k.json')

    def convert_list_to_dict(list_friendships):
        """Map each item's 'id' to its 'friends' value (later ids win)."""
        return {entry['id']: entry['friends'] for entry in list_friendships}

    x = convert_list_to_dict(friendships)
    print('Done')