 1. The inputs of the models should only include user-related features, e.g. the number of twits the user has posted. Any input related to a single twit is not a valid input. You should create suitable features for promising prediction.
 
2.You cannot use the follower number itself as a feature.
 
3. You should reorganize the twits dataset from http://www.ams.jhu.edu/~dan/stock_twits/twits.txt, and split the whole dataset so that 3/4 of the user data are used for training while the rest are used for testing.

4. Finally, you should print two confusion matrices as outputs.

5.OPTIONAL: You can apply random forest models on this problem and compare it with naive bayes and logistic regression


# 获取数据

In [277]:
import requests
import json
import time
import numpy as np

In [169]:
# Download the raw twits dump; every non-empty line is later parsed as one JSON document.
req = requests.get('http://www.ams.jhu.edu/~dan/stock_twits/twits.txt', timeout=60)
# Fail loudly on an HTTP error instead of trying to parse an error page as data.
req.raise_for_status()
# Drop blank lines (the response ends with an empty last line) rather than
# blindly cutting off the final element.
text_list = [line for line in req.text.split('\n') if line.strip()]

# 清洗数据

In [170]:
# Inspect how many raw lines were kept after splitting the response.
len(text_list)

2288

In [171]:
# Decode every raw line into a Python object (one JSON document per line).
json_list = list(map(json.loads, text_list))

In [172]:
# Inspect how many messages the first parsed record carries.
len(json_list[0]['messages'])

30

In [173]:
# Flatten the per-record 'messages' lists into one list of twits.
# The loop variable is deliberately NOT called `json`: that name would shadow
# the imported `json` module and break any later call to json.loads.
twit_list = [
    message
    for record in json_list
    for message in record['messages']
]


In [174]:
# De-duplicate twits by their 'id'.
# Walking the list in reverse reproduces the order that repeated pop() calls
# gave (the last occurrence of a duplicated id wins) without emptying
# twit_list, so this cell can safely be re-run.
# Membership is tested against a set for O(1) lookups; id_list is still
# filled so that it keeps its previous contents and order.
# NOTE(review): assumes twit ids are hashable (e.g. ints) - confirm.
id_list = []
clean_twit_list = []
seen_twit_ids = set()

for twit in reversed(twit_list):
    twit_id = twit['id']
    if twit_id in seen_twit_ids:
        continue
    seen_twit_ids.add(twit_id)
    id_list.append(twit_id)
    clean_twit_list.append(twit)


In [178]:
# Order the de-duplicated twits by their author's follower count (ascending).
sorted_twit_list = list(clean_twit_list)
sorted_twit_list.sort(key=lambda twit: twit['user']['followers'])

In [179]:
# Inspect how many unique twits remain after de-duplication.
len(sorted_twit_list)

4964

In [176]:
# Collect each distinct user once, keeping the user record attached to the
# first twit of that user encountered in clean_twit_list.
# A set gives O(1) membership tests; user_id_list is still filled so that it
# keeps its previous contents and order.
# NOTE(review): assumes user ids are hashable (e.g. ints) - confirm.
clean_user_list = []
user_id_list = []
seen_user_ids = set()
for twit in clean_twit_list:
    user = twit['user']
    user_id = user['id']
    if user_id in seen_user_ids:
        continue
    seen_user_ids.add(user_id)
    user_id_list.append(user_id)
    clean_user_list.append(user)

In [177]:
# Inspect how many distinct users were found.
len(clean_user_list)

1561

In [180]:
# Order the distinct users by follower count (ascending).
sorted_user_list = list(clean_user_list)
sorted_user_list.sort(key=lambda user: user['followers'])

In [188]:
# Inspect the last entry of the follower-sorted list, i.e. the user with the most followers.
sorted_user_list[-1]

{'avatar_url_ssl': 'https://avatars.stocktwits.com/production/170/thumb-1537898606.png',
 'join_date': '2009-08-31',
 'username': 'StockTwits',
 'identity': 'Official',
 'following': 10000,
 'official': True,
 'like_count': 49109,
 'name': 'StockTwits',
 'id': 170,
 'followers': 489995,
 'watchlist_stocks_count': 18,
 'ideas': 74125,
 'avatar_url': 'https://avatars.stocktwits.com/production/170/thumb-1537898606.png',
 'classification': ['suggested', 'official']}

In [288]:
# Convert a date string into a number
def time_to_int(time_str):
    """Convert a ``YYYY-MM-DD`` date string into an integer timestamp.

    The string is parsed with ``time.strptime`` and converted with
    ``time.mktime``, so the date is interpreted in the machine's local
    time zone.

    Args:
        time_str: Date formatted as ``"%Y-%m-%d"``.

    Returns:
        The timestamp in whole seconds, as an ``int``.

    Raises:
        ValueError: If ``time_str`` does not match the expected format.
    """
    parsed = time.strptime(time_str, "%Y-%m-%d")
    seconds = time.mktime(parsed)
    return int(seconds)

In [305]:
# Median follower count.
# sorted_user_list is ordered by followers, so the median is its middle
# element. The index is derived from the list length instead of being
# hard-coded, so it stays correct if the downloaded data changes.
median = sorted_user_list[len(sorted_user_list) // 2]['followers']

28



创建array:sorted_user_array

一共6个列：包含5 x个参数 和 1 个y结果
join_date, following, like_count, ideas, classification


In [296]:
# Allocate the data matrix: one row per user, six columns
# (five feature columns followed by one label column), all zero-initialised.
n_users = len(sorted_user_list)
sorted_user_array = np.zeros(shape=(n_users, 6))
print(sorted_user_array)

[[0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0.]
 ...
 [0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0.]]


In [336]:
# Fill the matrix row by row.
# Columns: 0 join_date (integer timestamp), 1 following, 2 like_count,
#          3 ideas, 4 number of classification tags, 5 label.
for row, user in enumerate(sorted_user_list):
    sorted_user_array[row, 0] = time_to_int(user['join_date'])
    # Negative 'following' counts are clamped to zero.
    sorted_user_array[row, 1] = max(user['following'], 0)
    sorted_user_array[row, 2] = user['like_count']
    sorted_user_array[row, 3] = user['ideas']
    sorted_user_array[row, 4] = len(user['classification'])
    # Label: 1 when the user has more followers than the median, else 0.
    # Uses the `median` variable instead of repeating its value (28) as a
    # magic number, so the label follows the data if it changes.
    sorted_user_array[row, 5] = 1 if user['followers'] > median else 0


In [386]:
# Scratch cell. Three quarters of the row count - presumably the intended
# training-set size; the value is neither stored nor the cell's last expression.
len(sorted_user_array)/4*3
# Draw 3 integers from 0..4 with replacement; the result is only displayed, not assigned.
np.random.choice(range(5),3,replace=True)

array([2, 4, 3])

In [364]:
# Deterministic 1/4 - 3/4 split: every fourth row (indices 0, 4, 8, ...) goes
# to the test set, all remaining rows form the training set.
row_index = np.arange(len(sorted_user_array))
test_array = sorted_user_array[::4]
train_array = sorted_user_array[row_index % 4 != 0]

# 分析数据

In [342]:
def fun(attr):
    """Sample one attribute at both ends of ``sorted_user_list``.

    Args:
        attr: Key to read from each user record.

    Returns:
        A dict with two lists: ``'large'`` holds the attribute for the last
        10 users of ``sorted_user_list`` and ``'little'`` for the first 10.
    """
    last_users = sorted_user_list[-10:]
    first_users = sorted_user_list[:10]
    return {
        'large': [user[attr] for user in last_users],
        'little': [user[attr] for user in first_users],
    }


fun('watchlist_stocks_count')

{'large': [3, 20, 1, 0, 138, 0, 19, 0, 0, 18],
 'little': [29, 99, 19, 3, 0, 2, 24, 9, 1, 2]}

这些是我的参数：join_date, following, like_count, ideas, classification（identity 最终没有作为特征写入 sorted_user_array）

### naive_bayes

In [379]:
from sklearn.metrics import confusion_matrix
from sklearn.naive_bayes import MultinomialNB

# Columns 0-4 are the features, column 5 is the label.
train_features, train_labels = train_array[:, :5], train_array[:, 5]
test_features, test_labels = test_array[:, :5], test_array[:, 5]

# Fit a multinomial naive Bayes model on the training rows.
clf = MultinomialNB()
clf.fit(train_features, train_labels)

# Predict the held-out rows and display the confusion matrix
# (first argument: true labels, second argument: predictions).
y_predict = clf.predict(test_features)
confusion_matrix(test_labels, y_predict)

array([[194,   2],
       [100,  95]])

### Logistic Regression

In [382]:
from sklearn.linear_model import LogisticRegression

# Columns 0-4 are the features, column 5 is the label.
train_features, train_labels = train_array[:, :5], train_array[:, 5]
test_features, test_labels = test_array[:, :5], test_array[:, 5]

# Fit a logistic regression model with default settings on the training rows.
clf = LogisticRegression()
clf.fit(train_features, train_labels)

# Predict the held-out rows and display the confusion matrix
# (first argument: true labels, second argument: predictions).
y_predict = clf.predict(test_features)
confusion_matrix(test_labels, y_predict)



array([[193,   3],
       [ 75, 120]])

### Decision tree

In [383]:
from sklearn import tree

# Fit a decision tree with default settings on the training rows
# (columns 0-4 are the features, column 5 is the label).
clf = tree.DecisionTreeClassifier()
# A stray trailing "8" on this line used to make the cell a syntax error.
clf = clf.fit(train_array[:, :5], train_array[:, 5])

# Predict the held-out rows and display the confusion matrix
# (first argument: true labels, second argument: predictions).
y_predict = clf.predict(test_array[:, :5])
confusion_matrix(test_array[:, 5], y_predict)

array([[159,  37],
       [ 49, 146]])