In [1]:
USERNAME = "ajufrjakova-372071"
 
import datetime
import pandas as pd
import numpy as np
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
#from pymorphy2 import MorphAnalyzer
import os
import socket
from pyspark import SparkConf, SparkContext, SQLContext
from pyspark.sql import SparkSession
from pyspark.sql.functions import udf, length, when, col
from pyspark.sql.types import BooleanType, IntegerType, LongType, StringType, ArrayType, FloatType, StructType, StructField
import pyspark.sql.functions as F
from pyspark.sql import types as T
from pyspark.sql.functions import pandas_udf
from pyspark.sql.functions import PandasUDFType
from jinja2 import Environment, FileSystemLoader
 
os.environ["PYSPARK_PYTHON"] = "/opt/conda/bin/python3.8"
SPARK_ADDRESS = "local[4]"
LOCAL_IP = socket.gethostbyname(socket.gethostname())
 
APP_NAME = "practice_current"
 
 
# run spark
spark = SparkSession\
    .builder\
    .appName(APP_NAME)\
    .master(SPARK_ADDRESS)\
    .config('spark.ui.port', "4040")\
    .config("spark.memory.fraction", "0.8")\
    .config("spark.memory.storageFraction", "0.6")\
    .config("spark.driver.memory", "4g")\
    .getOrCreate()

# Loading splitted data

In [2]:
summary_df = spark.read.parquet('hdfs:///project-dirs/2023-14000/split_data/summary.parquet')

title_df = spark.read.parquet('hdfs:///project-dirs/2023-14000/split_data/title.parquet')

main_df = spark.read.parquet('hdfs:///project-dirs/2023-14000/split_data/main.parquet')

link_df = spark.read.parquet('hdfs:///project-dirs/2023-14000/split_data/link.parquet')

In [3]:
link_df.printSchema()

root
 |-- text: string (nullable = true)
 |-- id: string (nullable = true)



# Preprocessing titles and summaries

In [4]:
def text_preprocess(df: "pyspark.sql.dataframe.DataFrame",
           F: "pyspark.sql.functions",
           T: "pyspark.sql.types",
           broadcast_func: "spark.sparkContext.broadcast") -> 'Tuple["pyspark.sql.dataframe.DataFrame"]':
    
    stop_words = nltk.corpus.stopwords.words("russian")
    stop_words_dict = broadcast_func(stop_words)
    
    @F.udf(returnType=T.StringType())
    def preproc(text):
        # remove numbers, extra spaces and some non-alphabetic characters
        text = text.replace('%', ' процент')
        text = text.replace('&quot', ' ')
        text = re.sub('й', 'й', text)
        text = re.sub('Й', 'Й', text)
        text = re.sub("[^0-9A-Za-zа-яА-ЯёЁ ]", " ", text)
        text = re.sub(' +', ' ', text).lstrip()
        # convert to lower case
        text = text.lower()
        text = text.split(" ")
        # remove stop words and lemmatization
        #text = [MorphAnalyzer().normal_forms(token)[0] for token in text]
        text = [word for word in text if not word in stop_words]
        text = " ".join(text)
        return text
    
    modified_df = df.where(F.col('text').isNotNull()).withColumn('text_proc', preproc('text'))\
    .select('id', 'text_proc')

    
    return modified_df


In [5]:
preproc_summary_df = text_preprocess(
    df=summary_df,
    F=F,
    T=T,
    broadcast_func=spark.sparkContext.broadcast
)

In [6]:
preproc_title_df = text_preprocess(
    df=title_df,
    F=F,
    T=T,
    broadcast_func=spark.sparkContext.broadcast
)

In [7]:
preproc_summary_df.show(10)
preproc_title_df.show(10)

+----+--------------------+
|  id|           text_proc|
+----+--------------------+
|ID_0|                    |
|ID_1|барселона договор...|
|ID_2|временное правите...|
|ID_3|газпром выставил ...|
|ID_4|дочка французской...|
|ID_5|единая россия нам...|
|ID_6|единая россия пре...|
|ID_7|зенит обжаловал р...|
|ID_8|локомотив своем о...|
|ID_9|локомотив одержал...|
+----+--------------------+
only showing top 10 rows

+----+--------------------+
|  id|           text_proc|
+----+--------------------+
|ID_0|                    |
|ID_1|12 виолончелистов...|
|ID_2|24 часа ле мана и...|
|ID_3|2morrow завтра мо...|
|ID_4|8 первых свиданий...|
|ID_5|a e a a a a c n i...|
|ID_6|coca cola поделил...|
|ID_7|do it гараже 400 ...|
|ID_8|forward us цукерб...|
|ID_9|gm автоваз повыси...|
+----+--------------------+
only showing top 10 rows



# Preprocessing types

In [8]:
def type_preprocess(df: "pyspark.sql.dataframe.DataFrame",
           F: "pyspark.sql.functions",
           T: "pyspark.sql.types",
           broadcast_func: "spark.sparkContext.broadcast") -> 'Tuple["pyspark.sql.dataframe.DataFrame"]':
    
    stop_words = nltk.corpus.stopwords.words("russian")
    stop_words_dict = broadcast_func(stop_words)
    
    @F.udf(returnType=T.StringType())
    def preproc(text):
        # remove numbers, extra spaces and some non-alphabetic characters
        text = re.sub('й', 'й', text)
        text = re.sub('Й', 'Й', text)
        text = re.sub("[^0-9A-Za-zа-яА-ЯёЁ ]", " ", text)
        text = re.sub(' +', ' ', text).lstrip()
        # convert to lower case
        text = text.lower()
        text = text.split(" ")
        # remove stop words and lemmatization
        #text = [MorphAnalyzer().normal_forms(token)[0] for token in text]
        text = [word for word in text if not word in stop_words]
        text = " ".join(text)
        return text
    
    modified_df = df.where(F.col('type').isNotNull()).withColumn('type_proc', preproc('type'))

    
    return modified_df

In [9]:
preproc_type_df = type_preprocess(
    df=main_df,
    F=F,
    T=T,
    broadcast_func=spark.sparkContext.broadcast
)

# Preprocessing date_parsed

In [10]:
def date_preprocess(df: "pyspark.sql.dataframe.DataFrame",
           F: "pyspark.sql.functions",
           T: "pyspark.sql.types") -> 'Tuple["pyspark.sql.dataframe.DataFrame"]':
    
    @F.udf(returnType=T.StringType())
    def preproc_date(input_date):
        # performs preprocessing of date input_date as string
        try:
            output_date = re.sub(r'\D', '/', input_date).split('/')
            if len(output_date[2]) == 4:
                output_date[0], output_date[2] = output_date[2], output_date[0]
            output_date = '/'.join(output_date)
        except:
            output_date = None
        return output_date
    
    modified_df = df.where(F.col('date_parsed').isNotNull()).withColumn('date_parsed_proc', preproc_date('date_parsed'))\

    
    return modified_df

In [11]:
preproc_main_df = date_preprocess(
    df=preproc_type_df,
    F=F,
    T=T
)

In [12]:
preproc_main_df.show(10)

+------------+----------+------+-----------+----------+---------+----------+------------+----------------+
|        type|      date|source|date_parsed|  title_id|  link_id|summary_id|   type_proc|date_parsed_proc|
+------------+----------+------+-----------+----------+---------+----------+------------+----------------+
|    ОБЩЕСТВО|2011-10-27|   aif| 2011-10-27|ID_1071283|ID_815561|ID_1353702|    общество|      2011/10/27|
|ПРОИСШЕСТВИЯ|2011-10-27|   aif| 2011-10-27|ID_1811736|  ID_8540|ID_2353835|происшествия|      2011/10/27|
|    ОБЩЕСТВО|2011-10-27|   aif| 2011-10-27| ID_814905|ID_557641|ID_2304122|    общество|      2011/10/27|
|    ОБЩЕСТВО|2011-10-27|   aif| 2011-10-27|ID_1141948|ID_557640|ID_1366343|    общество|      2011/10/27|
|ПРОИСШЕСТВИЯ|2011-10-27|   aif| 2011-10-27|ID_1763618| ID_87843|ID_1908113|происшествия|      2011/10/27|
|ПРОИСШЕСТВИЕ|2011-10-27|   aif| 2011-10-27| ID_982516|ID_245197| ID_744592|происшествие|      2011/10/27|
|       СПОРТ|2011-10-27|   aif| 2011

# Get unique tokens from titles, summaries and types

In [13]:
def get_vocab(df: "pyspark.sql.dataframe.DataFrame",
           F: "pyspark.sql.functions",
           T: "pyspark.sql.types") -> 'Tuple["pyspark.sql.dataframe.DataFrame"]':

    def text_to_words(text):
        words = [word for word in text.split(' ')]
        return words

    textToWords = udf(text_to_words, returnType=ArrayType(StringType()))
    
    word_counts_df = df\
    .where(col('text_proc').isNotNull())\
    .select(F.explode(textToWords('text_proc')).name('word'))\
    .groupby('word')\
    .count()\
    .orderBy('count', ascending=False)
    
    
    return  word_counts_df

In [14]:
def get_types(df: "pyspark.sql.dataframe.DataFrame",
           F: "pyspark.sql.functions",
           T: "pyspark.sql.types") -> 'Tuple["pyspark.sql.dataframe.DataFrame"]':
    
    word_counts_df = df\
    .where(col('type_proc').isNotNull())\
    .select('type')\
    .groupby('type')\
    .count()\
    .orderBy('count', ascending=False)
    
    
    return  word_counts_df

In [15]:
summary_unique = get_vocab(
    df=preproc_summary_df,
    F=F,
    T=T
)

In [16]:
title_unique = get_vocab(
    df=preproc_title_df,
    F=F,
    T=T
)

In [17]:
types_unique = get_types(
    df=preproc_main_df,
    F=F,
    T=T
)

In [18]:
summary_unique.show(10)
title_unique.show(10)
types_unique.show(10)

+-------+-------+
|   word|  count|
+-------+-------+
|       |2874810|
| россии| 564478|
|   nbsp| 329041|
|    это| 321706|
|   года| 286193|
| заявил| 279331|
|    сша| 268881|
|     рф| 231744|
|области| 206782|
|  также| 199914|
+-------+-------+
only showing top 10 rows

+-------+------+
|   word| count|
+-------+------+
|       |626017|
| россии|263179|
|    сша|199898|
|     рф|116571|
| москве| 90603|
|  путин| 86329|
|области| 77732|
|человек| 69847|
| против| 63380|
| заявил| 61166|
+-------+------+
only showing top 10 rows

+--------------+-------+
|          type|  count|
+--------------+-------+
|     Экономика|1537288|
|      Политика| 937839|
|        В мире| 748367|
|      Общество| 739195|
|Лента новостей| 698124|
|           Мир| 649760|
|         Спорт| 627528|
|        Россия| 476762|
|  Происшествия| 457470|
|      Культура| 451611|
+--------------+-------+
only showing top 10 rows



In [19]:
summary_unique.count()

1001161

In [20]:
title_unique.count()

465575

In [21]:
types_unique.count()

12344

# Lemmatization of vocabularies

Skipped, because MorphAnalyzer package is not installed
This part was performed locally and new vocabularies were pushed to cluster

# Lematization of texts in titles, summaries and types using lemmatized vocabularies

In [None]:
summary_voc = spark.read.csv('hdfs:///project-dirs/2023-14000/split_data/voc_summary.csv', header=True)

title_voc = spark.read.csv('hdfs:///project-dirs/2023-14000/split_data/voc_title.csv', header=True)

type_voc = spark.read.csv('hdfs:///project-dirs/2023-14000/split_data/types.csv', header=True)

In [None]:
sum_voc = summary_voc.toPandas()
sum_voc = dict(zip(sum_voc['word'], sum_voc['lemma']))

ttl_voc = title_voc.toPandas()
ttl_voc = dict(zip(ttl_voc['word'], ttl_voc['lemma']))

tp_voc = type_voc.toPandas()
tp_voc = dict(zip(tp_voc['type'], tp_voc['reformed_type']))

In [None]:
def text_to_lemma(df: "pyspark.sql.dataframe.DataFrame",
           F: "pyspark.sql.functions",
           T: "pyspark.sql.types",
           text_voc: dict,
           broadcast_func: "spark.sparkContext.broadcast") -> 'Tuple["pyspark.sql.dataframe.DataFrame"]':

    text_dict = broadcast_func(text_voc)    
    
    @F.udf(returnType=T.StringType())
    def lemmatize_text(text):
            text = text.split(" ")
            text = [text_dict.value.get(word, word) for word in text]
            output_text = " ".join(text)       
            return output_text

    modified_df = df.where(F.col('text_proc').isNotNull()).withColumn('lemmatized', lemmatize_text('text_proc'))\
    .select('id', 'text_proc','lemmatized')

    
    return modified_df


In [None]:
def type_to_lemma(df: "pyspark.sql.dataframe.DataFrame",
           F: "pyspark.sql.functions",
           T: "pyspark.sql.types",
           text_voc: dict,
           broadcast_func: "spark.sparkContext.broadcast") -> 'Tuple["pyspark.sql.dataframe.DataFrame"]':

    text_dict = broadcast_func(text_voc)    
    
    @F.udf(returnType=T.StringType())
    def lemmatize_text(text):
            output_text = text_dict.value.get(text, text)      
            return output_text

    modified_df = df.where(F.col('type_proc').isNotNull()).withColumn('lemmatized', lemmatize_text('type_proc'))\
    .drop('date_parsed', 'type_proc', 'type')\
    .select('date', 'source' , 'title_id', 'link_id', 'summary_id', F.col('lemmatized').alias('type'), F.col('date_parsed_proc').alias('date_parsed'))\
    
    

    
    return modified_df

In [None]:
lem_summary_df = text_to_lemma(
    df=preproc_summary_df,
    F=F,
    T=T,
    text_voc = sum_voc,
    broadcast_func=spark.sparkContext.broadcast
)

In [None]:
lem_title_df = text_to_lemma(
    df=preproc_title_df,
    F=F,
    T=T,
    text_voc = ttl_voc,
    broadcast_func=spark.sparkContext.broadcast
)

In [None]:
lem_main_df = type_to_lemma(
    df=preproc_main_df,
    F=F,
    T=T,
    text_voc = tp_voc,
    broadcast_func=spark.sparkContext.broadcast
)

In [None]:
lem_summary_df.show(10)
lem_title_df.show(10)
lem_main_df.show(10)

# Data Analysis


In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from matplotlib import rc_context

## Loading markers

In [None]:
summary = spark.read.csv('hdfs:///project-dirs/2023-14000/summary_marked.csv', header=True)
title = spark.read.csv('hdfs:///project-dirs/2023-14000/title_marked.csv', header=True)

## 1.Top words

Skipped, because of memory issues

## 2.Sentiments Rates

In [None]:
from collections import Counter

@F.udf(returnType=T.IntegerType())
def collect0(labels):
    counts = Counter(labels)
    if 0 not in counts.keys():
        return 0
    return counts[0]

@F.udf(returnType=T.IntegerType())
def collect1(labels):
    counts = Counter(labels)
    if 1 not in counts.keys():
        return 0
    return counts[1]

In [None]:
@F.udf(returnType=T.StringType())
def sources_to_groups(source):
    pro_rus = ['aif', 'argumenti', 'fontanka', 'interfax', 'izvestia', 'kp', 'lenta', 'mail', 'pravda', 'rbc', 'regnum',
               'rg', 'ria', 'rosbalt', 'tass', 'ura', 'vedomosti', 'vesti', 'yandex']
    pro_opp = ['currenttime', 'mediazona', 'meduza', 'novyagazeta', 'svoboda', 'thebell', 'tvrain']
    if source in pro_opp:
        return 'pro_opposite'
    else:
        return 'pro_russian'

In [17]:
def grouper(main, cols_to_group, max_groups=None):
    grouped = main.select(*cols_to_group, 'label') \
                  .groupby(cols_to_group) \
                  .agg(collect0(F.col('label')).alias('neg_count'),
                       collect1(F.col('label')).alias('pos_count'))
    grouped = grouped.withColumn('total', grouped['neg_count'] + grouped['pos_count']) \
                     .orderBy('total', ascending=False)
    if max_groups is not None:
        grouped = grouped.limit(max_groups)
    grouped = grouped.withColumn('neg_rate', grouped['neg_count'] / ( grouped['neg_count'] + grouped['pos_count'] )) \
                     .withColumn('pos_rate', grouped['pos_count'] / ( grouped['neg_count'] + grouped['pos_count'] ))
    return grouped

In [None]:
def reshape_df(df, cols_to_group):
    df0 = df.select(*cols_to_group, 'neg_rate') \
            .withColumnRenamed('neg_rate', 'rate') \
            .orderBy('rate', ascending=True) \
            .withColumn('rate', F.col('rate')*100) \
            .withColumn('label', F.lit('negative')) \
            .toPandas()
    df1 = df.select(*cols_to_group, 'pos_rate') \
            .withColumnRenamed('pos_rate', 'rate') \
            .orderBy('rate', ascending=True) \
            .withColumn('rate', F.col('rate')*100) \
            .withColumn('label', F.lit('non-negative')) \
            .toPandas()
    df_res = pd.concat([df0, df1], axis=0, ignore_index=True)
    return df_res

In [18]:
def custom_bar_plot(data, x, y, hue, fs=18, k=1, aspect_ratio=0.5, x_rotation=45,
                    x_vals=None, show_values=False, ylim=(0, 100)):
    sns.set_style("darkgrid")
    with rc_context({'font.size': fs * k}):
        plt.figure(figsize=(k * 16, k * 16 * aspect_ratio))
        if x_vals is not None:
          plot = sns.barplot(data=data[data[x].isin(x_vals)], x=x, y=y, hue=hue)
        else:
          plot = sns.barplot(data=data, x=x, y=y, hue=hue)
        if show_values:
          for container in plot.containers:
            plot.bar_label(container, fmt='%.0f')
        plt.ylim(*ylim)
        plt.xticks(rotation=x_rotation)
        plt.xlabel(x)
        plt.ylabel(y)
        plt.legend()
        plt.show()

In [None]:
def smooth(scalars, weight=0.9, reverse=False):
    """
    TODO
    """
    if reverse:
        scalars = list(np.array(scalars)[::-1])
    last = scalars[0]
    smoothed = list()
    for point in scalars:
        smoothed_val = last * weight + (1 - weight) * point
        smoothed.append(smoothed_val)
        last = smoothed_val
    if reverse:
        smoothed = list(np.array(smoothed)[::-1])
    return smoothed

In [None]:
def custom_line_plots(ids_start, ids_stop, groups, data, x, y, hue,
              fs=18, lw=5, x_label=None, y_label=None, k=1, aspect_ratio=0.5,
              weight=None, ylim=(0, 100), palette='bright', reverse=True):
    nrows = len(ids_start)
    with rc_context({'lines.linewidth': lw * k, 'font.size': fs * k}):
        fig, axs = plt.subplots(nrows=nrows,
                               figsize=(k * 16, k * 16 * aspect_ratio * nrows))
        sns.set_style("darkgrid")
        for (ax, id_start, id_stop) in zip(axs, ids_start, ids_stop):
            data2 = data[data[hue].isin(groups[id_start:id_stop])]
            if weight is not None:
              for source in groups[id_start:id_stop]:
                data2.loc[data2[hue]==source, y] = smooth(data2.loc[data2[hue]==source, y].tolist(), weight=weight,
                                                          reverse=reverse)
            sns.lineplot(ax=ax, data=data2, x=x, y=y, hue=hue, palette=palette)
            if x_label is not None: ax.set_xlabel(x_label)
            if y_label is not None: ax.set_ylabel(y_label)
            ax.set_ylim(*ylim)
        fig.tight_layout()
        plt.show()

In [None]:
@F.udf(returnType=T.IntegerType())
def get_year(date_in_str):
    return int(date_in_str[:4])

@F.udf(returnType=T.DatetimeType()) # not sure what to pick
def get_year_and_month(date_in_str):
    return datetime.datetime.strptime('/'.join(date_in_str.split('/')[:2]), '%Y/%m')

### title

In [None]:
main = lem_main_df.select('date_parsed','type','source','title_id')
main = main.join(title, main.title_id==title.id, how='left') \
           .drop('title_id', 'id') \
           .withColumn('source_group', sources_to_groups(F.col('source'))) \
           .withColumn('year', get_year(F.col('date_parsed'))) \
           .withColumn('date_parsed', get_year_and_month(F.col('date_parsed')))

In [None]:
grouped_by_year = grouper(main=main, cols_to_group=['year']).toPandas()
grouped_by_source = grouper(main=main, cols_to_group=['source']).toPandas()
grouped_by_sourceGroup = grouper(main=main, cols_to_group=['source_group']).toPandas()
grouped_by_type = grouper(main=main, cols_to_group=['type'], max_groups=10).toPandas()
grouped_by_year_and_source = grouper(main=main, cols_to_group=['source','year']).toPandas()

In [None]:
grouped_by_year = reshape_df(df=grouped_by_year, cols_to_group=['year'])
grouped_by_source = reshape_df(df=grouped_by_year, cols_to_group=['source'])
grouped_by_sourceGroup = reshape_df(df=grouped_by_sourceGroup, cols_to_group=['source_group'])
grouped_by_type = reshape_df(df=grouped_by_year, cols_to_group=['type'])
grouped_by_year_and_source = reshape_df(df=grouped_by_year, cols_to_group=['source','year'])

In [None]:
custom_bar_plot(data=grouped_by_type, x='type', y='rate', hue='label', fs=18, aspect_ratio=0.4, x_rotation=45, ylim=(0, 100))

In [None]:
custom_bar_plot(data=grouped_by_year, x='year', y='rate', hue='label', fs=18, aspect_ratio=0.4, x_vals=[k for k in range(2014, 2024)], ylim=(0, 100), show_values=True)

In [None]:
custom_bar_plot(data=grouped_by_source, x='source', y='rate', hue='label', fs=18, aspect_ratio=0.4, x_rotation=60, ylim=(0, 100))

In [None]:
custom_bar_plot(data=grouped_by_sourceGroup, x='group', y='rate', hue='label', k=0.5, fs=18*2, aspect_ratio=1, x_rotation=0, ylim=(0, 100), show_values=True)

In [None]:
custom_line_plots(ids_start=[0, 9, 18],
                 ids_stop=[9, 18, 26],
                 groups=list(grouped_by_year_and_source['source'].unique()),
                 data=grouped_by_year_and_source[grouped_by_year_and_source['date_parsed'] >= datetime.datetime.strptime('2014', '%Y')],
                 x='date_parsed',
                 y='neg_rate',
                 hue='source',
                 fs=18, lw=5,
                 x_label='date parsed',
                 y_label='Negativity Rate',
                 k=1,
                 aspect_ratio=0.3,
                 weight=0.9,
                 reverse=True)

### summary

In [None]:
main = lem_main_df.select('date_parsed','type','source','summary_id')
main = main.join(title, main.summary_id==summary.id, how='left') \
           .dropna(axis=0, subset='summary_id') \
           .drop('summary_id', 'id') \
           .withColumn('source_group', sources_to_groups(F.col('source'))) \
           .withColumn('year', get_year(F.col('date_parsed'))) \
           .withColumn('date_parsed', get_year_and_month(F.col('date_parsed')))

In [None]:
grouped_by_year = grouper(main=main, cols_to_group=['year']).toPandas()
grouped_by_source = grouper(main=main, cols_to_group=['source']).toPandas()
grouped_by_sourceGroup = grouper(main=main, cols_to_group=['source_group']).toPandas()
grouped_by_type = grouper(main=main, cols_to_group=['type'], max_groups=10).toPandas()
grouped_by_year_and_source = grouper(main=main, cols_to_group=['source','year']).toPandas()

In [None]:
grouped_by_year = reshape_df(df=grouped_by_year, cols_to_group=['year'])
grouped_by_source = reshape_df(df=grouped_by_year, cols_to_group=['source'])
grouped_by_sourceGroup = reshape_df(df=grouped_by_sourceGroup, cols_to_group=['source_group'])
grouped_by_type = reshape_df(df=grouped_by_year, cols_to_group=['type'])
grouped_by_year_and_source = reshape_df(df=grouped_by_year, cols_to_group=['source','year'])

In [None]:
custom_bar_plot(data=grouped_by_type, x='type', y='rate', hue='label', fs=18, aspect_ratio=0.4, x_rotation=45, ylim=(0, 100))

In [None]:
custom_bar_plot(data=grouped_by_year, x='year', y='rate', hue='label', fs=18, aspect_ratio=0.4, x_vals=[k for k in range(2014, 2024)], ylim=(0, 100), show_values=True)

In [None]:
custom_bar_plot(data=grouped_by_source, x='source', y='rate', hue='label', fs=18, aspect_ratio=0.4, x_rotation=60, ylim=(0, 100))

In [None]:
custom_bar_plot(data=grouped_by_sourceGroup, x='group', y='rate', hue='label', k=0.5, fs=18*2, aspect_ratio=1, x_rotation=0, ylim=(0, 100), show_values=True)

In [None]:
custom_line_plots(ids_start=[0, 8, 16],
                 ids_stop=[8, 16, 24],
                 groups=list(grouped_by_year_and_source['source'].unique()),
                 data=grouped_by_year_and_source[grouped_by_year_and_source['date_parsed'] >= datetime.datetime.strptime('2014', '%Y')],
                 x='date_parsed',
                 y='neg_rate',
                 hue='source',
                 fs=18, lw=5,
                 x_label='date parsed',
                 y_label='Negativity Rate',
                 k=1,
                 aspect_ratio=0.3,
                 ylim=(-5, 105),
                 weight=0.9,
                 reverse=True)