In [2]:
import plotly.express as px
import pandas as pd
import chart_studio.plotly as cs
import plotly

# Top 10 Real

In [3]:
# Read the word-frequency table for real news articles.
top10real = pd.read_csv(
    "Data/real_top10_meaningful_words_freq.csv"
)

In [4]:
# Remove the "Unnamed: 0" column in place.
# NOTE: re-running this cell without reloading raises KeyError (column already gone).
top10real.drop(columns="Unnamed: 0", inplace=True)

In [5]:
# Preview the first five rows (head() defaults to n=5).
top10real.head()

Unnamed: 0,Word,Frequency
0,said,2595
1,Trump,1973
2,Clinton,1693
3,would,1555
4,I,1345


In [42]:
# Bar chart of word frequencies in real articles, one colour per word.
bar_fig = px.bar(
    top10real,
    x="Word",
    y="Frequency",
    color="Word",
    color_discrete_sequence=px.colors.qualitative.T10,
    title="Top 10 Words That Appear in Real News Articles",
    width=500,
)
# Centre the title near the top; the legend is hidden because colour and
# x-axis both encode the same "Word" column.
bar_fig.update_layout(
    showlegend=False,
    title=dict(xanchor="center", yanchor="top", x=0.5, y=0.9),
)
bar_fig.show()

In [43]:
# Send the figure to Chart Studio under the name "top10real" and show it inline.
cs.iplot(bar_fig, filename="top10real")

# Top 10 Fake

In [44]:
# Read the word-frequency table for fake news articles.
top10fake = pd.read_csv(
    "Data/fake_top10_meaningful_words_freq.csv"
)

In [45]:
# Remove the "Unnamed: 0" column in place.
# NOTE: re-running this cell without reloading raises KeyError (column already gone).
top10fake.drop(columns="Unnamed: 0", inplace=True)

In [46]:
# Preview the first five rows (head() defaults to n=5).
top10fake.head()

Unnamed: 0,Word,Frequency
0,I,535
1,Trump,356
2,people,336
3,would,315
4,Hillary,279


In [47]:
# Bar chart of word frequencies in fake articles, one colour per word.
# Plots `top10fake`: the earlier version passed `top10real` here, so the
# "Fake News" chart silently showed the real-article frequencies.
bar_fig2 = px.bar(
    top10fake,
    x="Word",
    y="Frequency",
    title="Top 10 Words That Appear in Fake News Articles",
    color="Word",
    color_discrete_sequence=px.colors.qualitative.T10,
    width=500,
)
# Centre the title near the top; the legend is hidden because colour and
# x-axis both encode the same "Word" column.
bar_fig2.update_layout(
    showlegend=False,
    title={
        "xanchor": "center",
        "yanchor": "top",
        "y": 0.9,
        "x": 0.5,
    },
)
bar_fig2.show()

In [48]:
# Send the figure to Chart Studio under the name "top10fake" and show it inline.
cs.iplot(bar_fig2, filename="top10fake")

# Top 20 Contributing Words Fake

In [13]:
# Read the top-20 word contribution table for the fake outcome.
fake_cont = pd.read_csv(
    "Data/contribution_top20_fake.csv"
)

In [14]:
# Remove the "Unnamed: 0" column in place.
# NOTE: re-running this cell without reloading raises KeyError (column already gone).
fake_cont.drop(columns="Unnamed: 0", inplace=True)

In [15]:
# Bar chart of the "Contribution" value per word for the fake outcome.
f_c_bar = px.bar(
    fake_cont,
    x="Word",
    y="Contribution",
    color="Word",
    color_discrete_sequence=px.colors.qualitative.T10,
    title="Top 20 Contributing Words (Fake Outcome)",
    width=900,
)
# Hide the legend and centre the title. As the cell's last expression, the
# figure returned by update_layout is what gets rendered.
f_c_bar.update_layout(showlegend=False, title=dict(x=0.5))

In [16]:
# Send the figure to Chart Studio under the name "top20fake" and show it inline.
cs.iplot(f_c_bar, filename="top20fake")

# Top 20 Contributing Words Real

In [17]:
# Read the top-20 word contribution table for the real outcome.
real_cont = pd.read_csv(
    "Data/contribution_top20_words_real.csv"
)

In [18]:
# Flip the sign of every contribution value.
# NOTE(review): not idempotent -- running this cell twice restores the original
# signs; reload real_cont before re-running.
real_cont["Contribution"] = real_cont["Contribution"].mul(-1)

In [19]:
# Remove the "Unnamed: 0" column in place.
# NOTE: re-running this cell without reloading raises KeyError (column already gone).
real_cont.drop(columns="Unnamed: 0", inplace=True)

In [49]:
# Bar chart of the "Contribution" value per word for the real outcome.
r_c_bar = px.bar(
    real_cont,
    x="Word",
    y="Contribution",
    title="Top 20 Contributing Words (Real Outcome)",
    color="Word",
    color_discrete_sequence=px.colors.qualitative.T10,
    width=900,
)
# Apply the layout to r_c_bar itself. The earlier version called
# f_c_bar.update_layout here, which left this chart with its legend and an
# uncentred title and rendered the fake-outcome figure as the cell output.
r_c_bar.update_layout(
    showlegend=False,
    title={
        "x": 0.5
    },
)

In [50]:
# Send the figure to Chart Studio under the name "top20real" and show it inline.
cs.iplot(r_c_bar, filename="top20real")

# Number of Fake/Real Articles

In [22]:
# Load the news dataset from the Excel workbook.
news = pd.read_excel(
    "Data/final_news.xlsx"
)

In [23]:
# Drop the unwanted "Unnamed: 0" column in place.
# NOTE: re-running this cell without reloading raises KeyError (column already gone).
news.drop(columns="Unnamed: 0", inplace=True)

In [24]:
# Relabel the numeric Target codes as strings: 0 -> "Real", 1 -> "Fake".
news["Target"] = (
    news["Target"]
    .replace(0, "Real")
    .replace(1, "Fake")
)

In [25]:
# Pie chart of the share of rows per Target label (no `values`, so each row
# counts once).
# `color="Target"` is required for `color_discrete_map` to take effect:
# without it Plotly Express ignores the map and colours slices from the
# default sequence, so "Real"/"Fake" are not guaranteed their intended colours.
pie_fig = px.pie(
    news,
    names="Target",
    color="Target",
    title="Proportion of Real Vs Fake News Articles",
    color_discrete_map={
        "Real": "#636efa",
        "Fake": "#ef553b",
    },
    width=500,
)

In [26]:
# Centre the title near the top and lay the legend out horizontally, centred.
pie_fig.update_layout(
    title=dict(xanchor="center", yanchor="top", x=0.5, y=0.9),
    legend=dict(orientation="h", xanchor="center", x=0.5),
)

In [27]:
# Save the pie chart to the Chart Studio account and show it inline.
cs.iplot(
    pie_fig,
    filename="Prop_of_Real_vs_Fake_Articles",
)

# Number of Tweets for Fake/Real Articles

In [28]:
# Group the articles by the Target variable.
grouped_By_Target = news.groupby("Target")
# Sum the numeric columns per group. numeric_only=True is explicit because
# pandas >= 2.0 no longer drops non-numeric (text) columns automatically,
# and summing text columns is not meaningful here.
summed_Tweets = grouped_By_Target.sum(numeric_only=True)
summed_Tweets.reset_index(inplace=True)
# Pie chart of the total tweet count attached to real vs fake articles.
pie_fig2 = px.pie(
    summed_Tweets,
    names=summed_Tweets.Target,
    values=summed_Tweets.Number_of_tweets,
    color=summed_Tweets.Target,
    title="Proportion of Tweets in Real Vs Fake News Articles",
    color_discrete_map={
        "Real": "#636efa",
        "Fake": "#ef553b",
    },
    width=500,
)

In [29]:
# Centre the title near the top and lay the legend out horizontally, centred.
pie_fig2.update_layout(
    title=dict(xanchor="center", yanchor="top", x=0.5, y=0.9),
    legend=dict(orientation="h", xanchor="center", x=0.5),
)

In [30]:
# Send the tweet-proportion pie chart to Chart Studio and show it inline.
cs.iplot(
    pie_fig2,
    filename="Prop_of_Tweets_in_Real_vs_Fake_Articles",
)

# Average Title Length

In [31]:
# Per-Target mean of every numeric column (title length, body length, ...).
# numeric_only=True is required on pandas >= 2.0, where mean() raises
# TypeError on text columns instead of silently dropping them.
avg_lengths = grouped_By_Target.mean(numeric_only=True)
# Turn the Target index back into a regular column for plotting.
avg_lengths.reset_index(inplace=True)

In [32]:
# Display the per-Target averages table.
avg_lengths

Unnamed: 0,Target,Title_length,Body_length,Number_of_tweets,corpus_length_original
0,Fake,13.328323,441.868839,0.918587,274.166946
1,Real,9.919063,362.305105,0.008015,233.882471


In [33]:
# Bar chart comparing the mean title length of real and fake articles.
avg_title = px.bar(
    avg_lengths,
    x="Target",
    y="Title_length",
    color="Target",
    color_discrete_map={"Real": "#636efa", "Fake": "#ef553b"},
    title="Average Title Length",
    width=500,
)
# Centre the title. As the cell's last expression, the returned figure is rendered.
avg_title.update_layout(title=dict(xanchor="center", x=0.5))

In [34]:
# Send the figure to Chart Studio under the name "avg_title_length" and show it inline.
cs.iplot(avg_title, filename="avg_title_length")

# Average Body Length

In [35]:
# Bar chart comparing the mean body length of real and fake articles.
avg_body = px.bar(
    avg_lengths,
    x="Target",
    y="Body_length",
    color="Target",
    color_discrete_map={"Real": "#636efa", "Fake": "#ef553b"},
    title="Average Body Length",
    width=500,
)
# Centre the title horizontally, then render the figure.
avg_body.update_layout(title=dict(x=0.5))
avg_body.show()

In [36]:
# Send the figure to Chart Studio under the name "avg_body_length" and show it inline.
cs.iplot(avg_body, filename="avg_body_length")

In [37]:
# For every (Target, Title_length) pair, count the non-null values per column;
# the Article_title count is exposed as "Count" and only the three columns
# needed for plotting are kept.
count_titles = (
    news.groupby(["Target", "Title_length"])
    .count()
    .reset_index()
    .rename(columns={"Article_title": "Count"})[["Target", "Count", "Title_length"]]
)
count_titles.head()

Unnamed: 0,Target,Count,Title_length
0,Fake,10,3
1,Fake,26,4
2,Fake,52,5
3,Fake,89,6
4,Fake,157,7


In [40]:
# Line chart of article count per title length, one line per Target label.
line_title = px.line(
    count_titles,
    x="Title_length",
    y="Count",
    color="Target",
    color_discrete_map={"Real": "#636efa", "Fake": "#ef553b"},
    title="Distribution of Article Title Length vs Count",
    width=1000,
)
# Centre the title. As the cell's last expression, the returned figure is rendered.
line_title.update_layout(title=dict(x=0.5))

In [41]:
# Send the figure to Chart Studio under the name "dist_title_length" and show it inline.
cs.iplot(line_title, filename="dist_title_length")