In [2]:
import sqlite3

# Path to data base
db_path = r"C:\Users\PC\Desktop\Masterarbeit\Code\articlesWSJ.db"

# connect to the database
conn = sqlite3.connect(db_path)
c = conn.cursor()

# show tables in the database
c.execute("SELECT name FROM sqlite_master WHERE type='table'")
tables = c.fetchall()

print("Tabellen in der Datenbank:")
for t in tables:
    print(" -", t[0])

conn.close()


Tabellen in der Datenbank:
 - articles_index
 - sqlite_sequence
 - article
 - exploration


In [3]:
conn = sqlite3.connect(db_path)
c = conn.cursor()

# Show 5 articles with non-empty corpus
c.execute("""
    SELECT a.article_id, a.title, a.sub_title, a.corpus, i.link
    FROM article a
    JOIN articles_index i ON a.index_id = i.id
    WHERE a.corpus IS NOT NULL AND a.corpus != '' AND a.corpus != 'not found'
    LIMIT 5
""")
articles = c.fetchall()

for article in articles:
    print("\n Title:", article[1])
    print(" Link:", article[4])
    print(" Corpus:", article[3][:300], "...\n")

    

conn.close()



 Title: Baidu Terminates $3.6B Deal to Buy JOYY’s China Live-Streaming Business
 Link: https://www.wsj.com/business/telecom/baidu-terminates-3-6b-deal-to-buy-joyys-china-live-streaming-business-d68f9d86
 Corpus: Advertisement
BUSINESS
TELECOM
Baidu Terminates $3.6B Deal to Buy JOYY’s China Live-Streaming Business
As of the end of December, the closing conditions for the share purchase agreement had yet to be fully satisfied, Baidu said
By
P.R. Venkat
Follow
Jan. 1, 2024 6:44 pm ET
Gift unlocked article
List ...


 Title: The Military’s Phantom ‘Extremists’
 Link: https://www.wsj.com/opinion/military-extremists-report-institute-for-defense-analyses-pentagon-lloyd-austin-97619f4d
 Corpus: Advertisement
OPINION
REVIEW & OUTLOOK
Follow
The Military’s Phantom ‘Extremists’
An independent study puts to rest another false media narrative.
By
The Editorial Board
Follow
Jan. 1, 2024 5:45 pm ET
834
Gift unlocked article
Listen
(3 min)
Secretary of Defense Lloyd Austin PHOTO: SAUL LOEB/AGENCE  ..

In [4]:
# Connect to the database again to count articles with real content
conn = sqlite3.connect(db_path)
c = conn.cursor()

# count articles with non-empty corpus
c.execute("""
    SELECT COUNT(*) FROM article
    WHERE corpus IS NOT NULL AND corpus != '' AND corpus != 'not found'
""")

count = c.fetchone()[0]
print(f" Artikel mit echtem Inhalt: {count}")

conn.close()

 Artikel mit echtem Inhalt: 18472


In [5]:
import sqlite3
import pandas as pd

# Pfad zur DB
db_path =  r"C:\Users\PC\Desktop\Masterarbeit\Code\articlesWSJ.db"
conn = sqlite3.connect(db_path)
c = conn.cursor()

# SQL: echte Artikel zählen, nach Datum gruppiert
query = """
SELECT 
    ai.year, ai.month, ai.day, COUNT(*) as count
FROM article a
JOIN articles_index ai ON a.index_id = ai.id
WHERE a.corpus IS NOT NULL AND a.corpus != '' AND a.corpus != 'not found'
GROUP BY ai.year, ai.month, ai.day
ORDER BY ai.year, ai.month, ai.day
"""

df = pd.read_sql_query(query, conn)
conn.close()

# Datum als echte Spalte formatieren
df['date'] = pd.to_datetime(df['year'] + '-' + df['month'].str.zfill(2) + '-' + df['day'].str.zfill(2))
df = df[['date', 'count']].sort_values('date')

# ✅ Zeige Ergebnis
print(df[df['count'] < 30])
      



          date  count
5   2024-01-14     25
131 2024-02-17     26
132 2024-02-18     26
135 2024-02-20     28
138 2024-02-24     25
176 2024-04-27     27
212 2024-05-04     29
196 2024-05-18     26
201 2024-05-22     10
204 2024-05-25     25
205 2024-05-26     20
218 2024-06-01     28
224 2024-06-15     24
232 2024-06-22     28
239 2024-06-29     28
270 2024-07-05      1
260 2024-07-20     27
268 2024-07-28     14
269 2024-07-29      2
271 2024-08-11      1
272 2024-08-16      1
273 2024-08-22      1
278 2024-08-31     28
285 2024-09-01     27
57  2024-10-05     26
34  2024-10-12     29
41  2024-10-19     23
49  2024-10-26     28
91  2024-11-09     26
69  2024-11-16     25
77  2024-11-23     24
85  2024-11-30     25
120 2024-12-07     29
97  2024-12-14     21
105 2024-12-21     21
112 2024-12-28     18


In [6]:

# Verbindung aufbauen
conn = sqlite3.connect(db_path)

# SQL: Artikel zählen pro Tag (nur mit echtem Inhalt)
query = """
SELECT 
    ai.year, ai.month, ai.day,
    COUNT(DISTINCT a.article_id) as article_count
FROM article a
JOIN articles_index ai ON a.index_id = ai.id
WHERE a.corpus IS NOT NULL AND a.corpus != '' AND a.corpus != 'not found'
GROUP BY ai.year, ai.month, ai.day
HAVING article_count < 30
ORDER BY ai.year, ai.month, ai.day
"""


df = pd.read_sql_query(query, conn)
conn.close()

# Formatierte Datumsspalte
df['date'] = pd.to_datetime(df['year'] + '-' + df['month'].str.zfill(2) + '-' + df['day'].str.zfill(2))
df = df[['date', 'article_count']]

# Ergebnis anzeigen
print("📅 Tage mit weniger als 30 Artikeln:")
print(df)


📅 Tage mit weniger als 30 Artikeln:
         date  article_count
0  2024-01-14             25
1  2024-10-12             29
2  2024-10-19             23
3  2024-10-26             28
4  2024-10-05             26
5  2024-11-16             25
6  2024-11-23             24
7  2024-11-30             25
8  2024-11-09             26
9  2024-12-14             21
10 2024-12-21             21
11 2024-12-28             18
12 2024-12-07             29
13 2024-02-17             26
14 2024-02-18             26
15 2024-02-20             28
16 2024-02-24             25
17 2024-04-27             27
18 2024-05-18             26
19 2024-05-22             10
20 2024-05-25             25
21 2024-05-26             20
22 2024-05-04             29
23 2024-06-01             28
24 2024-06-15             24
25 2024-06-22             28
26 2024-06-29             28
27 2024-07-20             27
28 2024-07-28             14
29 2024-07-29              2
30 2024-07-05              1
31 2024-08-11              1
32 2024

In [4]:
import sqlite3
import pandas as pd

# Pfad zur DB
db_path =  r"C:\Users\PC\Desktop\Masterarbeit\Code\articlesWSJ.db"
conn = sqlite3.connect(db_path)
c = conn.cursor()

# SQL: Artikel zählen pro Tag (nur mit echtem Inhalt)
query = """
SELECT 
    ai.year, ai.month, ai.day,
    COUNT(*) as article_count
FROM article a
JOIN articles_index ai ON a.index_id = ai.id
GROUP BY ai.year, ai.month, ai.day
ORDER BY ai.year, ai.month, ai.day
"""

df = pd.read_sql_query(query, conn)
conn.close()

# Formatierte Datumsspalte
df['date'] = pd.to_datetime(df['year'] + '-' + df['month'].str.zfill(2) + '-' + df['day'].str.zfill(2))


# Ergebnis anzeigen
print(df[df['date'] == '2024-08-25'][['date', 'article_count']]) 

Empty DataFrame
Columns: [date, article_count]
Index: []


In [11]:
import pandas as pd
import sqlite3

# Verbindung aufbauen
# Pfad zur DB
db_path =  r"C:\Users\PC\Desktop\Masterarbeit\Code\articlesWSJ.db"
conn = sqlite3.connect(db_path)

# SQL: Lade alle Tage mit weniger als 30 Artikeln
query = """
SELECT 
    ai.year, ai.month, ai.day,
    COUNT(DISTINCT a.article_id) as article_count
FROM articles_index ai
LEFT JOIN article a ON a.index_id = ai.id
    AND a.corpus IS NOT NULL AND a.corpus != '' AND a.corpus != 'not found'
GROUP BY ai.year, ai.month, ai.day
HAVING article_count < 30
ORDER BY ai.year, ai.month, ai.day;
"""

# In DataFrame laden
df = pd.read_sql_query(query, conn)
conn.close()

# Datum erzeugen
df['month'] = df['month'].astype(str).str.zfill(2)
df['day'] = df['day'].astype(str).str.zfill(2)
df['date'] = pd.to_datetime(df['year'].astype(str) + '-' + df['month'] + '-' + df['day'])


# Ausgabe
print("Tage mit weniger als 30 Artikeln:")
print(df[['date', 'article_count']])


Tage mit weniger als 30 Artikeln:
         date  article_count
0  2024-01-14             25
1  2024-10-12             29
2  2024-10-19             23
3  2024-10-26             28
4  2024-10-05             26
5  2024-11-16             25
6  2024-11-23             24
7  2024-11-30             25
8  2024-11-09             26
9  2024-12-14             21
10 2024-12-21             21
11 2024-12-28             18
12 2024-12-07             29
13 2024-02-17             26
14 2024-02-18             26
15 2024-02-24             25
16 2024-03-02             29
17 2024-03-30             28
18 2024-04-27             27
19 2024-05-18             26
20 2024-05-25             25
21 2024-05-26             20
22 2024-05-04             29
23 2024-06-01             28
24 2024-06-15             24
25 2024-06-22             28
26 2024-06-29             28
27 2024-07-20             27
28 2024-07-06             25
29 2024-08-18              8
30 2024-08-19              0
31 2024-08-02              0
32 2024-0

In [5]:
import sqlite3
import time
from web_scrap import Search4Articles
import pandas as pd

# Absoluter Pfad zur DB
DB_PATH = r"C:\Users\PC\Desktop\Masterarbeit\Code\articlesWSJ.db"

# Verbindung aufbauen
conn = sqlite3.connect(DB_PATH)

# SQL: Artikel zählen pro Tag (nur mit echtem Inhalt)
query = """
SELECT ai.year, ai.month, ai.day,
    COUNT(DISTINCT a.article_id) as article_count
FROM articles_index ai
LEFT JOIN article a ON a.index_id = ai.id
    AND a.corpus IS NOT NULL AND a.corpus != '' AND a.corpus != 'not found'
GROUP BY ai.year, ai.month, ai.day
HAVING article_count < 30
ORDER BY ai.year, ai.month, ai.day
"""

# Lade Daten in DataFrame
df = pd.read_sql_query(query, conn)
conn.close()

# Datumsspalte erzeugen 
df['month'] = df['month'].astype(str).str.zfill(2)
df['day'] = df['day'].astype(str).str.zfill(2)
df['date'] = pd.to_datetime(df['year'].astype(str) + '-' + df['month'] + '-' + df['day'])

# 📦 Erzeuge Liste von (year, month, day)
# Ensure values are int
df[['year', 'month', 'day']] = df[['year', 'month', 'day']].astype(int)
custom_days = list(df[['year', 'month', 'day']].itertuples(index=False, name=None))

# Ausgabe zur Kontrolle
print("📅 Tage mit weniger als 30 Artikeln:")
print(df[['date', 'article_count']])


def has_enough_articles(year, month, day, db_name=DB_PATH, max_articles=30):
    conn = sqlite3.connect(db_name)
    c = conn.cursor()
    c.execute("""
        SELECT COUNT(DISTINCT a.article_id)
        FROM articles_index ai
        LEFT JOIN article a ON a.index_id = ai.id
            AND a.corpus IS NOT NULL AND a.corpus != '' AND a.corpus != 'not found'
        WHERE ai.year = ? AND ai.month = ? AND ai.day = ?
    """, (year, month, day))
    count = c.fetchone()[0]
    conn.close()
    return count >= max_articles

if __name__ == "__main__":
    max_articles_per_day = 30

    print("📅 Starte gezieltes Scraping für benutzerdefinierte Tage...")

    sa = Search4Articles(db_name=DB_PATH)

    for (year, month, day) in custom_days:
        print(f"🔍 Überprüfe {year}-{month:02d}-{day:02d}...")

        if has_enough_articles(year, month, day, max_articles=max_articles_per_day):
            print("✅ Genug Artikel vorhanden – überspringe.")
            continue

        print("🚀 Lade fehlende Artikel...")
        sa.navigation(max_per_day=max_articles_per_day, year=year, month=month, day=day)
        print("✅ Fertig mit diesem Tag.\n")
        time.sleep(10)

    sa.driver.quit()
    print("🏁 Alle angegebenen Tage wurden überprüft.")

📅 Tage mit weniger als 30 Artikeln:
         date  article_count
0  2024-01-14             25
1  2024-10-12             29
2  2024-10-19             23
3  2024-10-26             28
4  2024-10-05             26
5  2024-11-16             25
6  2024-11-23             24
7  2024-11-30             25
8  2024-11-09             26
9  2024-12-14             21
10 2024-12-21             21
11 2024-12-28             18
12 2024-12-07             29
13 2024-02-17             26
14 2024-02-18             26
15 2024-02-24             25
16 2024-03-02             29
17 2024-03-30             28
18 2024-04-27             27
19 2024-05-18             26
20 2024-05-25             25
21 2024-05-26             20
22 2024-05-04             29
23 2024-06-01             28
24 2024-06-15             24
25 2024-06-22             28
26 2024-06-29             28
27 2024-07-20             27
28 2024-07-06             25
29 2024-08-18              8
30 2024-08-19              0
31 2024-08-02              0
32 2024

In [9]:
import sqlite3
import pandas as pd
import matplotlib.pyplot as plt

# DB path
DB_PATH = r"C:\Users\PC\Desktop\Masterarbeit\Code\articlesWSJ.db"

# Connect to DB
conn = sqlite3.connect(DB_PATH)

# 🧠 Step 1: Tage mit weniger als 30 Artikeln (Problemtage)
query_problem_days = """
SELECT ai.year, ai.month, ai.day,
       COUNT(DISTINCT a.article_id) as article_count
FROM articles_index ai
LEFT JOIN article a ON ai.id = a.index_id
    AND a.corpus IS NOT NULL AND a.corpus != '' AND a.corpus != 'not found'
GROUP BY ai.year, ai.month, ai.day
HAVING article_count < 30
"""

df_problems = pd.read_sql_query(query_problem_days, conn)

# 🧠 Step 2: Zähle, wie viele Links es für diese Tage gibt
query_links = """
SELECT year, month, day, COUNT(*) as num_links
FROM articles_index
GROUP BY year, month, day
"""

df_links = pd.read_sql_query(query_links, conn)
conn.close()

# 🧠 Step 3: Join beide DataFrames auf (year, month, day)
df_merged = pd.merge(
    df_problems[['year', 'month', 'day']],
    df_links,
    on=['year', 'month', 'day'],
    how='left'
)

# Erzeuge Datums-Spalte
df_merged['date'] = pd.to_datetime(
    df_merged['year'].astype(str) + '-' +
    df_merged['month'].astype(str).str.zfill(2) + '-' +
    df_merged['day'].astype(str).str.zfill(2)
)

# Show all rows
pd.set_option('display.max_rows', None)  # zeigt alle Zeilen an

# Ergebnis anzeigen
print("📅 Link count for problematic days (article_count < 30):")
print(df_merged[['date', 'num_links']])




📅 Link count for problematic days (article_count < 30):
         date  num_links
0  2024-01-14         25
1  2024-10-12         29
2  2024-10-19         23
3  2024-10-26         28
4  2024-10-05         26
5  2024-11-16         25
6  2024-11-23         24
7  2024-11-30         25
8  2024-11-09         26
9  2024-12-14         21
10 2024-12-21         21
11 2024-12-28         18
12 2024-12-07         29
13 2024-02-17         26
14 2024-02-18         26
15 2024-02-24         25
16 2024-03-02         29
17 2024-03-30         28
18 2024-04-27         27
19 2024-05-18         26
20 2024-05-25         25
21 2024-05-26         20
22 2024-05-04         29
23 2024-06-01         28
24 2024-06-15         24
25 2024-06-22         28
26 2024-06-29         28
27 2024-07-20         27
28 2024-07-06         25
29 2024-08-18         42
30 2024-08-19         63
31 2024-08-02        102
32 2024-08-20         70
33 2024-08-21         85
34 2024-08-22         95
35 2024-08-23         92
36 2024-08-24      

In [11]:
import sqlite3
import pandas as pd
import matplotlib.pyplot as plt

# DB path
DB_PATH = r"C:\Users\PC\Desktop\Masterarbeit\Code\articlesWSJ.db"

# Connect to DB
conn = sqlite3.connect(DB_PATH)

# 🧠 Step 1: Tage mit weniger als 30 Artikeln (Problemtage)
query_problem_days = """
SELECT ai.year, ai.month, ai.day,
       COUNT(DISTINCT a.article_id) as article_count
FROM articles_index ai
LEFT JOIN article a ON ai.id = a.index_id
    AND a.corpus IS NOT NULL AND a.corpus != '' AND a.corpus != 'not found'
GROUP BY ai.year, ai.month, ai.day
HAVING article_count < 30
"""

df_problems = pd.read_sql_query(query_problem_days, conn)

# 🧠 Step 2: Zähle, wie viele Links es für diese Tage gibt
query_links = """
SELECT year, month, day, COUNT(*) as num_links
FROM articles_index
GROUP BY year, month, day
"""

df_links = pd.read_sql_query(query_links, conn)
conn.close()

# 🧠 Step 3: Join beide DataFrames auf (year, month, day)
df_merged = pd.merge(
    df_problems[['year', 'month', 'day']],
    df_links,
    on=['year', 'month', 'day'],
    how='left'
)

# Erzeuge Datums-Spalte
df_merged['date'] = pd.to_datetime(
    df_merged['year'].astype(str) + '-' +
    df_merged['month'].astype(str).str.zfill(2) + '-' +
    df_merged['day'].astype(str).str.zfill(2)
)

# Show all rows
pd.set_option('display.max_rows', None)  # zeigt alle Zeilen an

# Ergebnis anzeigen
print("📅 Link count for problematic days (article_count < 30):")
print(df_merged[['date', 'num_links']])



def reset_failed_articles(days, db_path):
    conn = sqlite3.connect(db_path)
    cursor = conn.cursor()

    for year, month, day in days:
        year, month, day = int(year), int(month), int(day)
        print(f"♻️ Resetting failed articles for {year}-{month:02d}-{day:02d}...")
            # Lösche Artikel ohne Inhalt
        cursor.execute("""
            DELETE FROM article
            WHERE index_id IN (
                SELECT id FROM articles_index
                WHERE year = ? AND month = ? AND day = ?
            )
            AND (corpus IS NULL OR corpus = '' OR corpus = 'not found')
        """, (year, month, day))

        # Setze scanned_status auf 0 für diese IDs
        cursor.execute("""
            UPDATE articles_index
            SET scanned_status = 0
            WHERE year = ? AND month = ? AND day = ?
            AND id NOT IN (
                SELECT index_id FROM article
            )
        """, (year, month, day))

    conn.commit()
    conn.close()
    print("✅ Reset completed.")

reset_failed_articles(
    list(df_merged[['year', 'month', 'day']].itertuples(index=False, name=None)),
    DB_PATH
)


📅 Link count for problematic days (article_count < 30):
         date  num_links
0  2024-01-14         25
1  2024-10-12         29
2  2024-10-19         23
3  2024-10-26         28
4  2024-10-05         26
5  2024-11-16         25
6  2024-11-23         24
7  2024-11-30         25
8  2024-11-09         26
9  2024-12-14         21
10 2024-12-21         21
11 2024-12-28         18
12 2024-12-07         29
13 2024-02-17         26
14 2024-02-18         26
15 2024-02-20         80
16 2024-02-21         87
17 2024-02-24         25
18 2024-03-16         30
19 2024-03-17         43
20 2024-03-18         87
21 2024-03-19         83
22 2024-03-02         29
23 2024-03-20         96
24 2024-03-21        106
25 2024-03-22        103
26 2024-03-23         35
27 2024-03-24         47
28 2024-03-25         77
29 2024-03-26         93
30 2024-03-27        102
31 2024-03-28         99
32 2024-03-29         89
33 2024-03-03         52
34 2024-03-30         28
35 2024-03-31         43
36 2024-03-04      