In [2]:
'''
data processing (first step)
'''
import pandas as pd
# Load the raw data. Later statements in this cell read the columns
# 'created_at', 'content', 'video_url', 'image_url', 'origin_weibo' and
# 'Unnamed: 0' from this frame, so the file must provide all of them.
data = pd.read_csv('raw.csv')
#classify the weibo based on the hour it was released
def split_time(x):
    """Map a post's publication hour to a time-of-day bucket.

    Parameters
    ----------
    x : mapping or pandas row
        Must expose ``x['created_at'].hour``.

    Returns
    -------
    int
        0 for [0, 6), 1 for [6, 9), 2 for [9, 12), 3 for [12, 14),
        4 for [14, 18) and 5 for everything else.
    """
    posted_hour = x['created_at'].hour
    # Exclusive upper bounds of buckets 0..4, in ascending order.
    for bucket, upper_bound in enumerate((6, 9, 12, 14, 18)):
        if posted_hour < upper_bound:
            return bucket
    return 5

# Quantify the raw columns into numeric features.
# Parse the timestamps once so hour and year come from real datetimes.
data['created_at'] = pd.to_datetime(data['created_at'])
data['time'] = data.apply(split_time, axis=1)

# Count mentions and hashtags. Missing content is treated as empty text so it
# counts as 0 instead of raising on a float NaN.
content_text = data['content'].fillna('').astype(str)
data['at_num'] = content_text.str.count('@')
data['hash_num'] = content_text.str.count('#')

# Presence flags: 1 when a URL is given, 0 when the cell is null.
data['has_video'] = data['video_url'].notna().astype(int)
data['has_image'] = data['image_url'].notna().astype(int)
# A post is original when it does not reference an origin weibo.
data['is_origin'] = data['origin_weibo'].isna().astype(int)
data['year'] = data['created_at'].dt.year
data = data.drop('Unnamed: 0', axis=1)

# One-hot encode the time bucket and the year (adds one indicator column per
# distinct value; the original 'time' and 'year' columns are kept).
data = data.join(pd.get_dummies(data.time))
data = data.join(pd.get_dummies(data.year))
# The index is written on purpose: later cells drop the resulting
# 'Unnamed: 0' columns by name.
data.to_csv('Weibo.csv')

In [None]:
import sys
# Install jieba with the same interpreter that runs this kernel
# (sys.executable), so the package lands in the environment the notebook uses.
!{sys.executable} -m pip install jieba

In [None]:
'''
Cluster the posts into news types.
The clustering is not seeded, so the result may differ on every run and may not match the clusters used in the report.
'''

import jieba
import pickle
import os
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans, MiniBatchKMeans
import codecs
from sklearn.manifold import TSNE

# Write the 'content' column to its own CSV file.
y = data['content']
y.to_csv('content.csv')
# Read it back, name the two columns, and discard the saved index column.
reloaded = pd.read_csv('content.csv')
reloaded.columns = ['index', 'content']
# NOTE: from here on `data` holds only the 'content' column.
data = reloaded.drop(columns=['index'])
# content1.csv is the file load_articles() reads.
data.to_csv('content1.csv')

def getStopWords():
    """Read the stop-word list from ``stopwords.txt`` in the working directory.

    Each line of the file is one entry; surrounding whitespace is stripped.
    Blank lines are kept as empty strings.

    Returns
    -------
    list of str
        The stop words in file order.

    Raises
    ------
    OSError
        If ``stopwords.txt`` cannot be opened.
    """
    # The context manager closes the file even if reading fails part-way.
    with open("stopwords.txt", "r", encoding='utf-8') as stopword_file:
        return [line.strip() for line in stopword_file]

def load_articles():
    """Tokenise every post in ``content1.csv`` and drop stop words.

    Each post is cut into words with jieba; words found in the stop-word list
    are removed and the rest are joined with single spaces.

    Returns
    -------
    list of str
        One space-separated token string per row of ``content1.csv``.
    """
    # A set makes the per-token stop-word check O(1) instead of a list scan.
    stop_words = set(getStopWords())
    data = pd.read_csv('content1.csv')
    # progress message
    print("clustering now ...")

    def keep_content_words(text):
        # str() guards against non-string cells such as NaN.
        return " ".join(word for word in jieba.cut(str(text)) if word not in stop_words)

    return data['content'].apply(keep_content_words).tolist()

def transform(articles, n_features=1000):
    """Vectorise the tokenised posts with TF-IDF.

    Parameters
    ----------
    articles : list of str
        Space-separated token strings, one per post.
    n_features : int, default 1000
        Passed to the vectorizer as ``max_features``.

    Returns
    -------
    tuple
        ``(X, vectorizer)``: the TF-IDF matrix and the fitted vectorizer.
    """
    tfidf_options = dict(max_df=0.5, max_features=n_features, min_df=5, use_idf=True)
    fitted = TfidfVectorizer(**tfidf_options)
    tfidf_matrix = fitted.fit_transform(articles)
    return tfidf_matrix, fitted


def train(X, vectorizer, true_k=10, mini_batch=False, show_label=False, random_state=None):
    """Cluster the TF-IDF matrix with k-means and save the labels.

    Side effects: shows a t-SNE scatter plot of the clusters, reads
    ``Weibo.csv``, writes ``Weibo&cluster.csv`` (with a ``type`` column and one
    indicator column per cluster) and prints the cluster sizes.

    Parameters
    ----------
    X : sparse matrix
        TF-IDF matrix as returned by ``transform``.
    vectorizer : TfidfVectorizer
        The fitted vectorizer; used only to look up term names.
    true_k : int, default 10
        Number of clusters.
    mini_batch : bool, default False
        Use ``MiniBatchKMeans`` (small random batches, for large data sets)
        instead of ``KMeans``.
    show_label : bool, default False
        Print the ten highest-weighted terms of every cluster centre.
    random_state : int or None, default None
        Seed passed to the clusterer and to t-SNE. ``None`` keeps the
        previous unseeded behaviour, so results vary between runs.

    Returns
    -------
    float
        ``-k_means.score(X)``, i.e. the k-means objective (sum of squared
        distances to the nearest centre). Lower is better; this is not a
        purity measure.
    """
    from collections import Counter

    if mini_batch:
        k_means = MiniBatchKMeans(n_clusters=true_k, init='k-means++', n_init=1,
                                  init_size=1000, batch_size=1000, verbose=False,
                                  random_state=random_state)
    else:
        k_means = KMeans(n_clusters=true_k, init='k-means++', max_iter=300, n_init=1,
                         verbose=False, random_state=random_state)
    k_means.fit(X)

    # show the distribution of the classes
    def plot1(something):
        tsne = TSNE(n_components=2, random_state=random_state)
        embedding = tsne.fit_transform(something)
        plt.figure(figsize=(10, 10))
        plt.axes()
        plt.scatter(embedding[:, 0], embedding[:, 1], c=k_means.labels_, marker="x")
        plt.xticks(())
        plt.yticks(())
        plt.show()

    if show_label:
        print("Top terms per cluster:")
        # Sort each centre's term weights in descending order.
        order_centroids = k_means.cluster_centers_.argsort()[:, ::-1]
        # get_feature_names() was removed in scikit-learn 1.2; fall back to it
        # only on versions that predate get_feature_names_out().
        if hasattr(vectorizer, 'get_feature_names_out'):
            terms = vectorizer.get_feature_names_out()
        else:
            terms = vectorizer.get_feature_names()
        for i in range(true_k):
            print("Cluster %d" % i, end='')
            # show the ten most important key words
            for ind in order_centroids[i, :10]:
                print(' %s' % terms[ind], end='')
            print()

    # Predict once and reuse the labels for both the CSV and the summary.
    result = list(k_means.predict(X))
    plot1(X.toarray())
    datanew = pd.read_csv('Weibo.csv')
    datanew['type'] = pd.Series(result)
    # join the result of the clustering to the dataframe
    datanew = datanew.join(pd.get_dummies(datanew.type))
    datanew.to_csv('Weibo&cluster.csv')
    # show the result
    print('Cluster distribution:')
    # Counter keeps first-seen order, matching the previous printed dict.
    print(dict(Counter(result)))
    return -k_means.score(X)


def plot_params():
    """Plot the per-document clustering error against the number of clusters.

    Runs ``train`` once for every k in 3..79 on a 500-feature TF-IDF matrix
    and plots ``train(...) / number_of_documents`` for each k. Every call to
    ``train`` also produces its own plot and rewrites ``Weibo&cluster.csv``.
    """
    # get all words in the content
    articles = load_articles()
    print("%d docments" % len(articles))
    # TF-IDF vectors are the representation the clustering measures distances in
    X, vectorizer = transform(articles, n_features=500)
    # candidate cluster counts: 3 up to and including 79
    true_ks = list(range(3, 80, 1))
    scores = [train(X, vectorizer, true_k=k) / len(articles) for k in true_ks]
    # plot a figure to show the trend
    plt.figure(figsize=(8, 4))
    plt.plot(true_ks, scores, label="error", color="red", linewidth=1)
    # The x-axis holds the number of clusters, not the number of features.
    plt.xlabel("number of clusters (k)")
    plt.ylabel("error")
    plt.legend()
    plt.show()

def out():
    """Run the clustering once with k=10 and print the top terms per cluster.

    Uses a 500-feature TF-IDF matrix. Returns nothing; the results are the
    side effects of ``train`` (plot, printed summary, ``Weibo&cluster.csv``).
    """
    documents = load_articles()
    tfidf_matrix, fitted_vectorizer = transform(documents, n_features=500)
    # Per-document error; computed as before but not used further.
    score = train(tfidf_matrix, fitted_vectorizer, true_k=10, show_label=True) / len(documents)

out()


In [26]:
# Manually merge the k-means clusters into named news topics.
dataclu = pd.read_csv('Weibo&cluster.csv')

# Topic name -> cluster indicator columns that make it up. Columns with a
# '.1' suffix are presumably the cluster indicators whose names collided with
# the time-bucket columns '0'..'5' when the CSV was read back; verify against
# the file. The assignment is specific to one clustering run.
TOPIC_CLUSTER_COLUMNS = [
    ('爆炸事故', ['0.1', '3.1']),
    ('地质灾害', ['1.1']),
    ('航空事故', ['2.1']),
    ('地震灾害', ['4.1', '6']),
    ('刑事案件', ['5.1', '8']),
    ('森林火灾', ['7']),
    ('病毒流感', ['9']),
]
for topic, cluster_columns in TOPIC_CLUSTER_COLUMNS:
    merged = dataclu[cluster_columns[0]]
    for extra_column in cluster_columns[1:]:
        merged = merged + dataclu[extra_column]
    dataclu[topic] = merged

# Drop the raw cluster indicators and the index columns left by the CSV round-trips.
dataclu = dataclu.drop(
    ['0.1', '1.1', '2.1', '3.1', '4.1', '5.1', '6', '7', '8', '9',
     'Unnamed: 0', 'Unnamed: 0.1'],
    axis=1,
)
# Rename the time-bucket indicator columns to the hour ranges they stand for.
dataclu = dataclu.rename(
    columns={'0': '0:6', '1': '6:9', '2': '9:12', '3': '12:14', '4': '14:18', '5': '18:24'}
)


# Saving is left disabled; the later cells read 'dataclustered.csv'.
#dataclu.to_csv('dataclustered.csv')



Index(['id', 'comment_num', 'repost_num', 'like_num', 'created_at',
       'image_url', 'video_url', 'origin_weibo', 'content', 'user_id',
       'fans_num', 'vip_level', 'tweets_num', 'time', 'at_num', 'hash_num',
       'has_video', 'has_image', 'is_origin', 'year', '0:6', '6:9', '9:12',
       '12:14', '14:18', '18:24', '2010', '2011', '2012', '2013', '2014',
       '2015', '2016', '2017', '2018', '2019', 'type', '爆炸事故', '地质灾害', '航空事故',
       '地震灾害', '刑事案件', '森林火灾', '病毒流感'],
      dtype='object')

In [None]:
'''
Multinomial Naive Bayes--prediction of comment_num,like_num,repost_num
'''
#set the factors used for prediction
import pandas as pd
data = pd.read_csv('dataclustered.csv')
df=data.drop(['Unnamed: 0','id','comment_num','repost_num','like_num','created_at','image_url','video_url','origin_weibo','content','user_id','time','year'],axis=1)
X = df
# import the packages needed
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

# Bin edges that turn each raw count into the class labels 1..4. The last
# edge is open-ended so a count above the previously observed maximum still
# gets label 4 instead of NaN.
target_bins = [
    ('comment_num', [-1, 50, 100, 1000, float('inf')]),
    ('like_num', [-1, 75, 500, 1000, float('inf')]),
    ('repost_num', [-1, 100, 500, 1000, float('inf')]),
]

# Test accuracy per target, in the order comment, like, repost.
list_accuracy=list()
for target, bins in target_bins:
    # replace the raw count with its class label
    data[target] = pd.cut(data[target], bins,labels=[1,2,3,4],right=True)
    y = data[target]
    # same seeded 80/20 split for every target
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)
    model = MultinomialNB().fit(X_train, y_train)
    predicted = model.predict(X_test)
    list_accuracy.append(accuracy_score(y_test, predicted))

print(list_accuracy)


In [None]:
'''
KNN--prediction of comment_num,like_num,repost_num
'''
#set the factors used for prediction
import pandas as pd
data = pd.read_csv('dataclustered.csv')
df=data.drop(['Unnamed: 0','id','comment_num','repost_num','like_num','created_at','image_url','video_url','origin_weibo','content','user_id','time','year'],axis=1)
X = df
# import the packages needed
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score

# Bin edges that turn each raw count into the class labels 1..4. The last
# edge is open-ended so a count above the previously observed maximum still
# gets label 4 instead of NaN.
target_bins = [
    ('comment_num', [-1, 50, 100, 1000, float('inf')]),
    ('like_num', [-1, 75, 500, 1000, float('inf')]),
    ('repost_num', [-1, 100, 500, 1000, float('inf')]),
]

# Test accuracy per target, in the order comment, like, repost.
list_accuracy=list()
for target, bins in target_bins:
    # replace the raw count with its class label
    data[target] = pd.cut(data[target], bins,labels=[1,2,3,4],right=True)
    y = data[target]
    # same seeded 80/20 split for every target
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)
    # Fit the scaler on the training part only, then apply it to both parts.
    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)
    classifier = KNeighborsClassifier(n_neighbors=4)
    classifier.fit(X_train, y_train)
    y_pred = classifier.predict(X_test)
    list_accuracy.append(accuracy_score(y_test, y_pred))

print(list_accuracy)


In [None]:
'''
SVM--prediction of comment_num,like_num,repost_num
'''
#set the factors used for prediction
import pandas as pd
data = pd.read_csv('dataclustered.csv')
df=data.drop(['Unnamed: 0','id','comment_num','repost_num','like_num','created_at','image_url','video_url','origin_weibo','content','user_id','time','year'],axis=1)
X = df

#import the packages needed
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score

# Bin edges that turn each raw count into the class labels 1..4. The last
# edge is open-ended so a count above the previously observed maximum still
# gets label 4 instead of NaN.
target_bins = [
    ('comment_num', [-1, 50, 100, 1000, float('inf')]),
    ('like_num', [-1, 75, 500, 1000, float('inf')]),
    ('repost_num', [-1, 100, 500, 1000, float('inf')]),
]

# Test accuracy per target, in the order comment, like, repost.
list_accuracy=list()
for target, bins in target_bins:
    # replace the raw count with its class label
    data[target] = pd.cut(data[target], bins,labels=[1,2,3,4],right=True)
    y = data[target]
    # Seeded like the splits of the other model cells, so the accuracies are
    # reproducible and comparable across models.
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=0)
    svclassifier = SVC(kernel='rbf',gamma='scale')
    svclassifier.fit(X_train, y_train)
    y_pred = svclassifier.predict(X_test)
    list_accuracy.append(accuracy_score(y_test, y_pred))

print(list_accuracy)


In [1]:
'''
Random Forest--prediction of comment_num,like_num,repost_num
'''
#set the factors used for prediction
import pandas as pd
data = pd.read_csv('dataclustered.csv')
df=data.drop(['Unnamed: 0','id','comment_num','repost_num','like_num','created_at','image_url','video_url','origin_weibo','content','user_id','time','year'],axis=1)
X = df
#import packages needed
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

# Bin edges that turn each raw count into the class labels 1..4. The last
# edge is open-ended so a count above the previously observed maximum still
# gets label 4 instead of NaN.
target_bins = [
    ('comment_num', [-1, 50, 100, 1000, float('inf')]),
    ('like_num', [-1, 75, 500, 1000, float('inf')]),
    ('repost_num', [-1, 100, 500, 1000, float('inf')]),
]

# Test accuracy per target, in the order comment, like, repost.
list_accuracy=list()
for target, bins in target_bins:
    # replace the raw count with its class label
    data[target] = pd.cut(data[target], bins,labels=[1,2,3,4],right=True)
    y = data[target]
    # same seeded 80/20 split for every target
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)
    # Fit the scaler on the training part only, then apply it to both parts.
    sc = StandardScaler()
    X_train = sc.fit_transform(X_train)
    X_test = sc.transform(X_test)
    classifier = RandomForestClassifier(n_estimators=200, random_state=0)
    classifier.fit(X_train, y_train)
    y_pred = classifier.predict(X_test)
    list_accuracy.append(accuracy_score(y_test, y_pred))

print(list_accuracy)


[0.7642010625255414, 0.8167143440948099, 0.7754393134450347]


In [1]:
'''
Adjusted Random Forest algorithm
'''

import pandas as pd
import numpy as np
import random

def train_test_split(df, y, test_size):
    """Randomly split a feature frame into train and test frames.

    The labels are attached as a ``label`` column, so both returned frames
    carry features and label together. The input frame is left untouched.

    Note: running this cell rebinds the name ``train_test_split``; the
    signature and return value differ from scikit-learn's function of the
    same name that earlier cells import.

    Parameters
    ----------
    df : pandas.DataFrame
        Feature columns.
    y : pandas.Series
        Labels, aligned to ``df`` by index.
    test_size : float
        Fraction of rows to put in the test frame.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(train_df, test_df)``.
    """
    # Work on a copy so the caller's frame does not silently gain a column.
    df = df.copy()
    df['label'] = y
    n_test_rows = round(test_size * len(df))
    row_labels = df.index.tolist()
    test_indices = random.sample(population=row_labels, k=n_test_rows)
    test_df = df.loc[test_indices]
    train_df = df.drop(test_indices)
    return train_df, test_df

def calculate_accuracy(predictions, labels):
    """Return the fraction of predictions that equal their label.

    Both arguments are compared element-wise with ``==`` and the mean of the
    resulting boolean values is returned.
    """
    return (predictions == labels).mean()

def check_purity(data):
    """Return True when every row of ``data`` has the same label.

    ``data`` is a 2-D array whose last column holds the class label.
    """
    distinct_labels = np.unique(data[:, -1])
    return len(distinct_labels) == 1

def classify_data(data):
    """Return the most frequent label in ``data``.

    ``data`` is a 2-D array whose last column holds the class label. On a tie
    the smallest of the tied labels is returned, because ``np.unique`` sorts
    the labels and ``argmax`` picks the first maximum.
    """
    labels, label_counts = np.unique(data[:, -1], return_counts=True)
    return labels[label_counts.argmax()]

def calculate_entropy(data):
    """Return the Shannon entropy (base 2) of the labels in ``data``.

    ``data`` is a 2-D array whose last column holds the class label.
    """
    _, label_counts = np.unique(data[:, -1], return_counts=True)
    p = label_counts / label_counts.sum()
    return sum(-np.log2(p) * p)

def calculate_overall_entropy(data_below, data_above):
    """Return the size-weighted entropy of the two halves of a split.

    Each half's entropy is weighted by its share of the combined row count.
    """
    total_rows = len(data_below) + len(data_above)
    weight_below = len(data_below) / total_rows
    weight_above = len(data_above) / total_rows
    weighted_below = weight_below * calculate_entropy(data_below)
    weighted_above = weight_above * calculate_entropy(data_above)
    return weighted_below + weighted_above

def split_data(data, split_column, split_value):
    """Partition the rows of ``data`` on one column.

    Returns ``(data_below, data_above)``: the rows whose value in
    ``split_column`` is ``<= split_value`` and the rows whose value is
    ``> split_value``.
    """
    column = data[:, split_column]
    # Two explicit boolean masks; a row that satisfies neither comparison
    # lands in neither half.
    at_or_below = column <= split_value
    strictly_above = column > split_value
    return data[at_or_below], data[strictly_above]

class RandomForest():
    """Random forest built from the notebook's own ``DecisionTree``.

    Constructing an instance splits the data into train and test frames and
    trains all trees immediately.

    Parameters
    ----------
    df : pandas.DataFrame
        Feature columns.
    y : pandas.Series
        Class labels for the rows of ``df``.
    n_trees : int
        Number of trees in the forest.
    n_bootstrap : int
        Number of rows drawn (with replacement) to train each tree.
    n_features : int
        Passed to each tree both as ``n_features`` and as ``random_subspace``,
        i.e. the number of columns considered at each split.
    dt_max_depth : int
        Maximum depth of each tree.
    min_samples : int
        A node with fewer rows than this is not split further.
    test_size : float, default 0.2
        Fraction of rows held out in ``self.test_df``.
    """

    def __init__(self, df, y, n_trees,n_bootstrap, n_features, dt_max_depth, min_samples, test_size=0.2):
        self.df, self.y, self.n_trees, self.n_bootstrap, self.n_features  = df, y, n_trees, n_bootstrap, n_features
        self.dt_max_depth, self.min_samples = dt_max_depth, min_samples
        self.test_size = test_size
        self.train_df, self.test_df = train_test_split(df, y, test_size=test_size)
        self.trees, self.forest = self.random_forest_algorithm()

    def bootstrapping(self):
        """Draw ``n_bootstrap`` training rows at random, with replacement."""
        bootstrap_indices = np.random.randint(low=0, high=len(self.train_df), size=self.n_bootstrap)
        df_bootstrapped = self.train_df.iloc[bootstrap_indices]
        return df_bootstrapped

    def random_forest_algorithm(self):
        """Train ``n_trees`` trees, each on its own bootstrap sample.

        Returns ``(trees, forest)``: the ``DecisionTree`` objects and, at the
        same positions, the trained tree structures they produced.
        """
        trees =[]
        forest = []
        for i in range(self.n_trees):
            df_bootstrapped = self.bootstrapping()
            tree = DecisionTree(df_bootstrapped, self.n_features, self.dt_max_depth, self.min_samples,random_subspace=self.n_features)
            trees.append(tree)
            forest.append(tree.decision_tree_algorithm(tree.df))
        return trees, forest

    def random_forest_predictions(self):
        """Predict the class of every row in ``self.test_df`` by majority vote.

        Every tree predicts every test row; the most frequent prediction per
        row is returned (the first column of the row-wise mode).
        """
        df_predictions = {}
        for i in range(len(self.trees)):
            column_name = "tree_{}".format(i)
            predictions = self.trees[i].decision_tree_predictions(self.test_df, tree=self.forest[i])
            df_predictions[column_name] = predictions
        df_predictions = pd.DataFrame(df_predictions)
        random_forest_predictions = df_predictions.mode(axis=1)[0]
        return random_forest_predictions


class DecisionTree():
    """Entropy-based decision tree stored as nested dictionaries.

    A trained tree is either a bare class label (a leaf) or a dictionary with
    one key ``"<feature> <- <value>"`` mapped to a two-item list
    ``[yes_subtree, no_subtree]``. The "yes" branch is taken when the
    example's feature value is ``<= value``.

    Parameters
    ----------
    df : pandas.DataFrame
        Training rows; the last column is the class label.
    n_features : int
        Stored on the instance; not read by the methods of this class.
    max_depth : int
        Depth at which splitting stops.
    min_samples : int
        A node with fewer rows than this becomes a leaf.
    random_subspace : int or None, default None
        Number of randomly chosen columns considered at each split. ``None``
        (or a value larger than the number of columns) means all columns.
    """

    def __init__(self, df, n_features, max_depth, min_samples,random_subspace=None):
        self.df, self.n_features, self.max_depth, self.min_samples = df, n_features, max_depth, min_samples
        self.random_subspace = random_subspace
        # Column names used to label the split questions.
        self.column_headers = getattr(df, 'columns', None)

    def get_potential_splits(self,data):
        """Return ``{column_index: unique values}`` for the candidate columns."""
        potential_splits = {}
        column_indices = list(range(data.shape[1] - 1))  # excluding the last column (label)
        if self.random_subspace and self.random_subspace <= len(column_indices):
            # randomly select k columns from column_indices
            column_indices = random.sample(population=column_indices, k=self.random_subspace)
        for column_index in column_indices:
            potential_splits[column_index] = np.unique(data[:, column_index])
        return potential_splits

    def determine_best_split(self,data, potential_splits):
        """Return the ``(column, value)`` split with the lowest overall entropy.

        Ties are resolved in favour of the candidate examined last. Returns
        ``(None, None)`` when there is no candidate at all.
        """
        overall_entropy = float('inf')
        best_split_column, best_split_value = None, None
        # go through all the potential splits
        for column_index in potential_splits:
            for value in potential_splits[column_index]:
                data_below, data_above = split_data(data, split_column=column_index, split_value=value)
                current_overall_entropy = calculate_overall_entropy(data_below, data_above)
                # compare total entropy and select
                if current_overall_entropy <= overall_entropy:
                    overall_entropy = current_overall_entropy
                    best_split_column = column_index
                    best_split_value = value
        return best_split_column, best_split_value

    def decision_tree_algorithm(self, df, counter=0):
        """Build the tree recursively and return it.

        At the top-level call (``counter == 0``) the training data is taken
        from ``self.df``; the ``df`` argument is only used in the recursive
        calls, where it is a NumPy array of the rows reaching the node.
        """
        # data preparations
        if counter == 0:
            # The module-level name is still published for any code that
            # reads it; the tree itself uses the instance attribute.
            global COLUMN_HEADERS
            COLUMN_HEADERS = self.df.columns
            self.column_headers = self.df.columns
            data = self.df.values
        else:
            data = df
        # all the conditions that end the splitting of a branch
        if (check_purity(data)) or (len(data) < self.min_samples) or (counter == self.max_depth):
            return classify_data(data)

        counter += 1  # track the depth so max_depth can stop the recursion
        potential_splits = self.get_potential_splits(data)
        split_column, split_value = self.determine_best_split(data, potential_splits)
        # nothing to split on: fall back to the majority class
        if split_column is None:
            return classify_data(data)
        data_below, data_above = split_data(data, split_column, split_value)
        # a split that leaves one side empty separates nothing
        if len(data_below) == 0 or len(data_above) == 0:
            return classify_data(data)
        # determine question
        feature_name = self.column_headers[split_column]
        # this string is parsed again by predict_example
        question = "{} <- {}".format(feature_name, split_value)
        # instantiate sub-tree
        sub_tree = {question: []}
        # find answers (recursion)
        yes_answer = self.decision_tree_algorithm(data_below, counter)
        no_answer = self.decision_tree_algorithm(data_above, counter)
        # If the answers are the same, there is no point in asking the question.
        if yes_answer == no_answer:
            sub_tree = yes_answer
        else:
            sub_tree[question].append(yes_answer)
            sub_tree[question].append(no_answer)
        return sub_tree

    def predict_example(self,example, tree):
        """Walk ``tree`` for one example and return the predicted class.

        ``example`` must support ``example[feature_name]``. ``tree`` may be a
        nested dictionary or a bare class label.
        """
        # A tree can collapse to a single leaf; then that label is the answer.
        if not isinstance(tree, dict):
            return tree
        question = next(iter(tree))
        # Split on the separator itself so feature names may contain spaces.
        feature_name, _, value = question.rpartition(" <- ")
        if example[feature_name] <= float(value):
            answer = tree[question][0]
        else:
            answer = tree[question][1]
        # a non-dictionary answer is a leaf
        if not isinstance(answer, dict):
            return answer
        return self.predict_example(example, answer)

    def decision_tree_predictions(self,test_df, tree):
        """Predict the class of every row of ``test_df`` with ``tree``."""
        predictions = test_df.apply(self.predict_example, args=(tree,), axis=1)
        return predictions


In [None]:
'''
Use our adjusted Random Forest algorithm for prediction
'''
# Columns that are not used as features.
NON_FEATURE_COLUMNS = ['Unnamed: 0','id','comment_num','repost_num','like_num','created_at','image_url','video_url','origin_weibo','content','user_id','time','year']
# Bin edges that turn each raw count into the class labels 1..4. The last
# edge is open-ended so a count above the previously observed maximum still
# gets label 4 instead of NaN.
FOREST_TARGET_BINS = [
    ('comment_num', [-1, 50, 100, 1000, float('inf')]),
    ('like_num', [-1, 75, 500, 1000, float('inf')]),
    ('repost_num', [-1, 100, 500, 1000, float('inf')]),
]

data = pd.read_csv('dataclustered.csv')
for target, bins in FOREST_TARGET_BINS:
    # Build a fresh feature frame for every target so no label column from a
    # previous round can end up among the features.
    df = data.drop(NON_FEATURE_COLUMNS, axis=1)
    labels = pd.cut(data[target], bins,labels=[1,2,3,4],right=True)
    # Build random forest class
    My_Random_Forest = RandomForest(df, labels, n_trees=200,n_bootstrap=100, n_features=6, dt_max_depth=10, min_samples=2)
    predictions = My_Random_Forest.random_forest_predictions()
    accuracy = calculate_accuracy(predictions,My_Random_Forest.test_df.label)
    # printed in the order comment, like, repost
    print(accuracy)

In [None]:
'''
Find and visualize the most important 5 features for the prediction model
'''
from sklearn.datasets import make_classification
from sklearn.ensemble import ExtraTreesClassifier
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

df = pd.read_csv('dataclustered.csv')
data=df.drop(['Unnamed: 0','id','comment_num','repost_num','like_num','type','created_at','image_url','video_url','origin_weibo','content','user_id','time','year'],axis=1)
bins = [-1, 50, 100, 1000, 136265]
df['comment_num'] = pd.cut(df['comment_num'], bins,labels=[1,2,3,4],right=True)
X = data
bins = [-1, 100, 500, 1000,  565454]
df['repost_num'] = pd.cut(df['repost_num'], bins,labels=[1,2,3,4],right=True)
bins = [-1,75 , 500, 1000, 306161]
df['like_num'] = pd.cut(df['like_num'], bins,labels=[1,2,3,4],right=True)
print(X)
#y = df['comment_num']
#y = df['repost_num']
#y = df['like_num']

# Build a forest and compute the feature importances
forest = ExtraTreesClassifier(n_estimators=100,
                              random_state=0)

forest.fit(X, y)
importances = forest.feature_importances_
std = np.std([tree.feature_importances_ for tree in forest.estimators_],
             axis=0)
indices = np.argsort(importances)[::-1]

# Print the feature ranking
print("Feature ranking:")

for f in range(X.shape[1]):
    print("%d. feature %d (%f)" % (f + 1, indices[f], importances[indices[f]]))
    
df = pd.read_csv('dataclustered.csv')
df = df.drop(['Unnamed: 0','id'],axis=1)
df[df['type']==9].describe()

# Plot the feature importances of the forest
import numpy as np
import matplotlib.pyplot as plt
from matplotlib import cm
from matplotlib.collections import LineCollection

%matplotlib inline
map_vir = cm.get_cmap(name='Greens')
n=5
norm = plt.Normalize(importances[indices][:n].min(), importances[indices][:n].max())
norm_y = norm(importances[indices][:n])
color = map_vir(norm_y)
plt.figure()
plt.title("Feature importances for like number")
#plt.title("Feature importances for comment number")
#plt.title("Feature importances for repost number")

plt.bar(range(5), importances[indices][:5],
       color=color, yerr=std[indices][:5], align="center")
#plt.xticks(range(5), ['fans','tweets','hash','at','vip'])
#plt.xticks(range(5), ['fans','tweets','2013','hash','2012'])
#plt.xticks(range(5), ['fans','tweets','hash','at','vip'])
#plt.xticks(range(5), indices)
plt.xlim([-1, 5])
plt.show();
